diff --git a/sanitizer/_synthesize_analysis.js b/sanitizer/_synthesize_analysis.js index 79c53187..ab18b581 100644 --- a/sanitizer/_synthesize_analysis.js +++ b/sanitizer/_synthesize_analysis.js @@ -21,6 +21,10 @@ function isPostalCodeOnly(parsed_text) { parsed_text.hasOwnProperty('postalcode'); } +// figure out which field contains the probable house number, prefer number +// libpostal parses some inputs, like `3370 cobbe ave`, as a postcode+street +// so because we're treating the entire field as a street address, it's safe +// to assume that an identified postcode is actually a house number. function getHouseNumberField(analyzed_address) { for (var field of ['number', 'postalcode']) { if (analyzed_address.hasOwnProperty(field)) { @@ -54,21 +58,31 @@ function sanitize( raw, clean ){ } if (clean.parsed_text.hasOwnProperty('address')) { - var analyzed_address = text_analyzer.parse(clean.parsed_text.address); + const analyzed_address = text_analyzer.parse(clean.parsed_text.address); const house_number_field = getHouseNumberField(analyzed_address); + // if we're fairly certain that libpostal identified a house number + // (from either the house_number or postcode field), place it into the + // number field and remove the first instance of that value from address + // and assign to street + // eg - '1090 N Charlotte St' becomes number=1090 and street=N Charlotte St if (house_number_field) { clean.parsed_text.number = analyzed_address[house_number_field]; + // remove the first instance of the number and trim whitespace clean.parsed_text.street = _.trim(_.replace(clean.parsed_text.address, clean.parsed_text.number, '')); - delete clean.parsed_text.address; } else { + // otherwise no house number was identifiable, so treat the entire input + // as a street clean.parsed_text.street = clean.parsed_text.address; - delete clean.parsed_text.address; + } + // the address field no longer means anything since it's been parsed, so remove it + delete clean.parsed_text.address; + } return messages;