# app/services/bulk_upload/sales/year2023/csv_parser.rb
require "csv"

# Reads a sales bulk-upload CSV (2023 layout) from disk and exposes its data
# rows as BulkUpload::Sales::Year2023::RowParser objects.
#
# Two layouts are handled:
# * with headers    - some row has a first cell matching "field number"; the
#                     remaining cells of that row name the field held in each
#                     column, the first column of every row is skipped, and
#                     data starts on the following row.
# * without headers - every row is data and the columns are assumed to follow
#                     the order given by default_field_numbers.
class BulkUpload::Sales::Year2023::CsvParser
  # Not referenced inside this class.
  # NOTE(review): presumably the accepted column-count bounds, checked by a caller - confirm.
  MIN_COLUMNS = 135
  MAX_COLUMNS = 142

  # First-cell marker identifying the row that lists field numbers.
  FIELD_NUMBER_ROW = /field number/i
  # A header cell counts as a field number only if it is digits and nothing else.
  # \A/\z (not ^/$) so a multi-line cell such as "12\nfoo" is rejected.
  NUMERIC_HEADER = /\A[0-9]+\z/
  private_constant :FIELD_NUMBER_ROW, :NUMERIC_HEADER

  attr_reader :path

  # @param path [String] location of the CSV file to read
  def initialize(path:)
    @path = path
  end

  # @return [Integer] index of the first data row: one past the field-number
  #   row when headers are present, otherwise 0
  def row_offset
    return 0 unless with_headers?

    # with_headers? has already established that a matching row exists, so
    # find_index cannot return nil here. Rows with a nil first cell (blank
    # lines, empty leading cells) are skipped rather than raising.
    rows.find_index { |row| field_number_row?(row) } + 1
  end

  # @return [Integer] number of leading columns to skip in each row
  #   (1 when headers are present, otherwise 0)
  def col_offset
    with_headers? ? 1 : 0
  end

  # @return [Array<String>] spreadsheet-style column letters "A" through "EK"
  def cols
    @cols ||= ("A".."EK").to_a
  end

  # @return [Array<BulkUpload::Sales::Year2023::RowParser>] one parser per
  #   data row, built from a field-name => cell-value hash
  def row_parsers
    @row_parsers ||= begin
      # Resolved once, outside the loop, instead of once per row.
      offset = col_offset

      body_rows.map do |row|
        # drop (unlike row[offset..]) yields [] for a row shorter than the
        # offset, so a blank line does not blow up in zip.
        attributes = field_numbers.zip(row.drop(offset)).to_h

        BulkUpload::Sales::Year2023::RowParser.new(attributes)
      end
    end
  end

  # @return [Array<Array>] the parsed rows from row_offset onwards
  def body_rows
    rows[row_offset..]
  end

  # @return [Array<Array>] every row of the file as parsed by CSV
  def rows
    @rows ||= CSV.parse(normalised_string, row_sep:)
  end

  # @param field [String] a field name such as "field_19"
  # @return [String, nil] the column letter holding that field, or nil when
  #   the field is not present in this file's layout
  def column_for_field(field)
    index = field_numbers.find_index(field)
    return unless index

    cols[index + col_offset]
  end

private

  # Column order used when the file carries no field-number row. nil entries
  # mark columns that hold no field and are named "field_blank".
  def default_field_numbers
    [
      6, 3, 4, 5, nil, 28, 30, 38, 47, 51, 55, 59, 31, 39, 48, 52, 56, 60, 37, 46,
      50, 54, 58, 35, 43, 49, 53, 57, 61, 32, 33, 78, 80, 79, 81, 83, 84, nil, 62, 66,
      64, 65, 63, 67, 69, 70, 68, 76, 77, 16, 17, 18, 26, 24, 25, 27, 8, 91, 95, 96,
      97, 92, 93, 94, 98, 100, 101, 103, 104, 106, 110, 111, 112, 113, 114, 9, 116, 117, 118, 120,
      124, 125, 126, 10, 11, nil, 127, 129, 133, 134, 135, 1, 2, nil, 73, nil, 75, 107, 108, 121,
      122, 130, 131, 82, 109, 123, 132, 115, 15, 86, 87, 29, 7, 12, 13, 14, 36, 44, 45, 88,
      89, 102, 105, 119, 128, 19, 20, 21, 22, 23, 34, 40, 41, 42, 71, 72, 74, 85, 90, 99
    ].map { |number| number ? "field_#{number}" : "field_blank" }
  end

  # Field name for each data column, in file order: read from the
  # field-number row when there is one, otherwise the default order.
  def field_numbers
    @field_numbers ||= if with_headers?
                         rows[row_offset - 1].drop(col_offset).map { |header| field_name_for(header) }
                       else
                         default_field_numbers
                       end
  end

  # Maps one header cell to a field name; nil, empty and non-numeric cells
  # all become "field_blank".
  def field_name_for(header)
    header&.match?(NUMERIC_HEADER) ? "field_#{header}" : "field_blank"
  end

  # Not called from within this class.
  def headers
    @headers ||= ("field_1".."field_135").to_a
  end

  # True when any row's first cell matches "field number". Memoised with
  # defined? (not ||=) so that a false result is cached as well.
  def with_headers?
    return @with_headers if defined?(@with_headers)

    @with_headers = rows.any? { |row| field_number_row?(row) }
  end

  # Whether the row's first cell marks it as the field-number row; rows with
  # no first cell never match.
  def field_number_row?(row)
    row[0]&.match?(FIELD_NUMBER_ROW)
  end

  def row_sep
    "\n"
  end

  # File contents as UTF-8 with any byte order mark removed (via the
  # "bom|utf-8" read mode), invalid byte sequences deleted and CRLF line
  # endings converted to LF. Scrubbing happens first so that a CRLF pair
  # separated only by invalid bytes is still normalised.
  def normalised_string
    @normalised_string ||= File.read(path, encoding: "bom|utf-8").scrub("").gsub("\r\n", "\n")
  end
end

# spec/services/bulk_upload/sales/year2023/csv_parser_spec.rb
require "rails_helper"

RSpec.describe BulkUpload::Sales::Year2023::CsvParser do
  subject(:service) { described_class.new(path:) }

  let(:file) { Tempfile.new }
  let(:path) { file.path }
  let(:log) { build(:sales_log, :completed, :with_uprn) }

  context "when parsing csv with headers" do
    before do
      file.write("Question\n")
      file.write("Additional info\n")
      file.write("Values\n")
      file.write("Can be empty?\n")
      file.write("Type of letting the question applies to\n")
      file.write("Duplicate check field?\n")
      file.write(BulkUpload::SalesLogToCsv.new(log:).default_2023_field_numbers_row)
      file.write(BulkUpload::SalesLogToCsv.new(log:).to_2023_csv_row)
      file.rewind
    end

    it "returns correct offsets" do
      expect(service.row_offset).to eq(7)
      expect(service.col_offset).to eq(1)
    end

    it "parses csv correctly" do
      expect(service.row_parsers[0].field_19).to eql(log.uprn)
    end
  end

  context "when parsing csv with headers in arbitrary order" do
    let(:seed) { rand }

    before do
      file.write("Question\n")
      file.write("Additional info\n")
      file.write("Values\n")
      file.write("Can be empty?\n")
      file.write("Type of letting the question applies to\n")
      file.write("Duplicate check field?\n")
      file.write(BulkUpload::SalesLogToCsv.new(log:).default_2023_field_numbers_row(seed:))
      file.write(BulkUpload::SalesLogToCsv.new(log:).to_2023_csv_row(seed:))
      file.rewind
    end

    it "returns correct offsets" do
      expect(service.row_offset).to eq(7)
      expect(service.col_offset).to eq(1)
    end

    it "parses csv correctly" do
      expect(service.row_parsers[0].field_19).to eql(log.uprn)
    end
  end

  # A blank line parses to an empty row whose first cell is nil.
  context "when a blank line precedes the field numbers row" do
    before do
      file.write("Question\n")
      file.write("\n")
      file.write(BulkUpload::SalesLogToCsv.new(log:).default_2023_field_numbers_row)
      file.write(BulkUpload::SalesLogToCsv.new(log:).to_2023_csv_row)
      file.rewind
    end

    it "returns correct offsets" do
      expect(service.row_offset).to eq(3)
      expect(service.col_offset).to eq(1)
    end
  end

  context "when parsing csv without headers" do
    let(:file) { Tempfile.new }
    let(:path) { file.path }
    let(:log) { build(:sales_log, :completed, :with_uprn) }

    before do
      file.write(BulkUpload::SalesLogToCsv.new(log:, col_offset: 0).to_2023_csv_row)
      file.rewind
    end

    it "returns correct offsets" do
      expect(service.row_offset).to eq(0)
      expect(service.col_offset).to eq(0)
    end

    it "parses csv correctly" do
      expect(service.row_parsers[0].field_19).to eql(log.uprn)
    end
  end

  context "when parsing with BOM aka byte order mark" do
    let(:file) { Tempfile.new }
    let(:path) { file.path }
    let(:log) { build(:sales_log, :completed, :with_uprn) }
    let(:bom) { "\uFEFF" }

    before do
      file.write(bom)
      file.write(BulkUpload::SalesLogToCsv.new(log:, col_offset: 0).to_2023_csv_row)
      file.close
    end

    it "parses csv correctly" do
      expect(service.row_parsers[0].field_19).to eql(log.uprn)
    end
  end

  context "when an invalid byte sequence" do
    let(:file) { Tempfile.new }
    let(:path) { file.path }
    let(:log) { build(:sales_log, :completed, :with_uprn) }
    let(:invalid_sequence) { "\x81" }

    before do
      file.write(invalid_sequence)
      file.write(BulkUpload::SalesLogToCsv.new(log:, col_offset: 0).to_2023_csv_row)
      file.close
    end

    it "parses csv correctly" do
      expect(service.row_parsers[0].field_19).to eql(log.uprn)
    end
  end

  describe "#column_for_field", aggregate_failures: true do
    context "when headers present" do
      before do
        file.write("Question\n")
        file.write("Additional info\n")
        file.write("Values\n")
        file.write("Can be empty?\n")
        file.write("Type of letting the question applies to\n")
        file.write("Duplicate check field?\n")
        file.write(BulkUpload::SalesLogToCsv.new(log:).default_2023_field_numbers_row)
        file.write(BulkUpload::SalesLogToCsv.new(log:).to_2023_csv_row)
        file.rewind
      end

      it "returns correct column" do
        expect(service.column_for_field("field_1")).to eql("CO")
        expect(service.column_for_field("field_99")).to eql("EK")
      end
    end

    context "when no headers" do
      let(:file) { Tempfile.new }
      let(:path) { file.path }
      let(:log) { build(:sales_log, :completed, :with_uprn) }

      before do
        file.write(BulkUpload::SalesLogToCsv.new(log:, col_offset: 0).to_2023_csv_row)
        file.rewind
      end

      it "returns correct column" do
        expect(service.column_for_field("field_1")).to eql("CN")
        expect(service.column_for_field("field_99")).to eql("EJ")
      end

      it "returns nil for an unknown field" do
        expect(service.column_for_field("field_999")).to be_nil
      end
    end
  end
end