diff --git a/sanitizer/_city_name_standardizer.js b/sanitizer/_city_name_standardizer.js
new file mode 100644
index 00000000..3bcf817d
--- /dev/null
+++ b/sanitizer/_city_name_standardizer.js
@@ -0,0 +1,58 @@
+const _ = require('lodash');
+
+// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary
+const mountSaintFort = /\b([fm]t|sainte?)\b/g;
+
+// maps each token matched by mountSaintFort to its standardized form
+const transliterations = {
+  'mt': 'mount',
+  'ft': 'fort',
+  'saint': 'st',
+  'sainte': 'ste'
+};
+
+// String.replace callback: returns the standardized form of a matched token
+function transliterate(match) {
+  return _.get(transliterations, match);
+}
+
+/**
+ * Standardizes clean.parsed_text.city in place: lowercases it and
+ * transliterates ft/mt/saint/sainte to fort/mount/st/ste, respectively.
+ * Does nothing when parsed_text.city is missing or empty.
+ *
+ * @param {object} raw - not read by this sanitizer
+ * @param {object} clean - its parsed_text.city is overwritten when non-empty
+ * @returns {{errors: Array, warnings: Array}} always empty lists
+ */
+function sanitize(raw, clean) {
+  // error & warning messages
+  // this function doesn't add any error or warning messages
+  const messages = { errors: [], warnings: [] };
+
+  // only try to transliterate if there is a city in parsed_text
+  if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
+    // eg input: Ft. Saint Louis
+    // after 1. ft saint louis
+    // after 2. fort st louis
+    // after 3. fort st louis
+
+    // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
+    const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');
+
+    // 2. transliterate 'saint'->'st', etc
+    const transliterated = periods_removed.replace(mountSaintFort, transliterate);
+
+    // 3. reduce whitespace sequences that can occur when removing periods down to a single space
+    //    (needs the 'g' flag: without it only the first sequence would be collapsed)
+    const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/g, ' '));
+
+    clean.parsed_text.city = whitespace_normalized;
+
+  }
+
+  return messages;
+
+}
+
+module.exports = sanitize;
diff --git a/sanitizer/search.js b/sanitizer/search.js
index d99a926e..5694c9eb 100644
--- a/sanitizer/search.js
+++ b/sanitizer/search.js
@@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
       quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
       text: require('../sanitizer/_text'),
       iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
+      city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
       size: require('../sanitizer/_size')(/* use defaults*/),
       layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
       sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),
diff --git a/sanitizer/structured_geocoding.js b/sanitizer/structured_geocoding.js
index ebd55a56..29edce41 100644
--- a/sanitizer/structured_geocoding.js
+++ b/sanitizer/structured_geocoding.js
@@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
       quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
       synthesize_analysis: require('../sanitizer/_synthesize_analysis'),
       iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
+      city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
       size: require('../sanitizer/_size')(/* use defaults*/),
       layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
       sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),
diff --git a/test/unit/run.js b/test/unit/run.js
index a3109585..d21890e7 100644
--- a/test/unit/run.js
+++ b/test/unit/run.js
@@ -48,6 +48,7 @@ var tests = [
   require('./sanitizer/_ids'),
   require('./sanitizer/_iso2_to_iso3'),
   require('./sanitizer/_layers'),
+  require('./sanitizer/_city_name_standardizer'),
   require('./sanitizer/_single_scalar_parameters'),
   require('./sanitizer/_size'),
   require('./sanitizer/_sources'),
diff --git a/test/unit/sanitizer/_city_name_standardizer.js b/test/unit/sanitizer/_city_name_standardizer.js
new file mode 100644
index 00000000..da362868
--- /dev/null
+++ b/test/unit/sanitizer/_city_name_standardizer.js
@@ -0,0 +1,309 @@
+const _ = require('lodash');
+const sanitizer = require('../../../sanitizer/_city_name_standardizer');
+
+module.exports.tests = {};
+
+module.exports.tests.text_parser = function(test, common) {
+  test('clean without parsed_text should not throw exception', function(t) {
+    const raw = {};
+
+    const clean = {
+    };
+
+    const expected_clean = {
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('undefined parsed_text.city should be unchanged', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        address: 'address value',
+        city: undefined
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        address: 'address value',
+        city: undefined
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'saint\' should be abbreviated to \'st\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'saint query value',
+        neighbourhood: 'saint neighbourhood value',
+        borough: 'saint borough value',
+        city: 'SainT city sAiNt value saInt',
+        county: 'saint county value',
+        state: 'saint state value',
+        postalcode: 'saint postalcode value',
+        country: 'saint country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'saint query value',
+        neighbourhood: 'saint neighbourhood value',
+        borough: 'saint borough value',
+        city: 'st city st value st',
+        county: 'saint county value',
+        state: 'saint state value',
+        postalcode: 'saint postalcode value',
+        country: 'saint country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'sainte query value',
+        neighbourhood: 'sainte neighbourhood value',
+        borough: 'sainte borough value',
+        city: 'SaintE city sAinTe value saINte',
+        county: 'sainte county value',
+        state: 'sainte state value',
+        postalcode: 'sainte postalcode value',
+        country: 'sainte country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'sainte query value',
+        neighbourhood: 'sainte neighbourhood value',
+        borough: 'sainte borough value',
+        city: 'ste city ste value ste',
+        county: 'sainte county value',
+        state: 'sainte state value',
+        postalcode: 'sainte postalcode value',
+        country: 'sainte country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'ft\' should be expanded to \'fort\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'ft query value',
+        neighbourhood: 'ft neighbourhood value',
+        borough: 'ft borough value',
+        city: 'Ft city ft value fT',
+        county: 'ft county value',
+        state: 'ft state value',
+        postalcode: 'ft postalcode value',
+        country: 'ft country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'ft query value',
+        neighbourhood: 'ft neighbourhood value',
+        borough: 'ft borough value',
+        city: 'fort city fort value fort',
+        county: 'ft county value',
+        state: 'ft state value',
+        postalcode: 'ft postalcode value',
+        country: 'ft country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'mt\' should be expanded to \'mount\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'mt query value',
+        neighbourhood: 'mt neighbourhood value',
+        borough: 'mt borough value',
+        city: 'Mt city mt value mT',
+        county: 'mt county value',
+        state: 'mt state value',
+        postalcode: 'mt postalcode value',
+        country: 'mt country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'mt query value',
+        neighbourhood: 'mt neighbourhood value',
+        borough: 'mt borough value',
+        city: 'mount city mount value mount',
+        county: 'mt county value',
+        state: 'mt state value',
+        postalcode: 'mt postalcode value',
+        country: 'mt country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt. ft saint sainte mt ft.'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount fort st ste mount fort'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('period word boundary on \'mt.\' should replace with a space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt.city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('period word boundary on \'ft.\' should replace with a space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'ft.city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'fort city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('every whitespace sequence left by removed periods should collapse to one space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt. ft. city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount fort city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+};
+
+module.exports.all = function (tape, common) {
+  function test(name, testFunction) {
+    return tape('sanitizer _city_name_standardizer: ' + name, testFunction);
+  }
+
+  for( const testCase in module.exports.tests ){
+    module.exports.tests[testCase](test, common);
+  }
+};
diff --git a/test/unit/sanitizer/search.js b/test/unit/sanitizer/search.js
index e2c01f8d..0d44103d 100644
--- a/test/unit/sanitizer/search.js
+++ b/test/unit/sanitizer/search.js
@@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
         called_sanitizers.push('_iso2_to_iso3');
         return { errors: [], warnings: [] };
       },
+      '../sanitizer/_city_name_standardizer': function() {
+        called_sanitizers.push('_city_name_standardizer');
+        return { errors: [], warnings: [] };
+      },
       '../sanitizer/_size': function() {
         if (arguments.length === 0) {
           return function() {
@@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
       '_deprecate_quattroshapes',
       '_text',
       '_iso2_to_iso3',
+      '_city_name_standardizer',
       '_size',
       '_targets/layers',
       '_targets/sources',
diff --git a/test/unit/sanitizer/structured_geocoding.js b/test/unit/sanitizer/structured_geocoding.js
index ef9711b3..133be462 100644
--- a/test/unit/sanitizer/structured_geocoding.js
+++ b/test/unit/sanitizer/structured_geocoding.js
@@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
         called_sanitizers.push('_iso2_to_iso3');
         return { errors: [], warnings: [] };
       },
+      '../sanitizer/_city_name_standardizer': function() {
+        called_sanitizers.push('_city_name_standardizer');
+        return { errors: [], warnings: [] };
+      },
       '../sanitizer/_size': function() {
         if (arguments.length === 0) {
           return function() {
@@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
       '_deprecate_quattroshapes',
       '_synthesize_analysis',
       '_iso2_to_iso3',
+      '_city_name_standardizer',
       '_size',
       '_targets/layers',
       '_targets/sources',