diff --git a/query/text_parser.js b/query/text_parser.js
index 4ccc8661..fd07cd0d 100644
--- a/query/text_parser.js
+++ b/query/text_parser.js
@@ -1,67 +1,68 @@
 var logger = require('pelias-logger').get('api');
+var _ = require('lodash');
 
 // all the address parsing logic
 function addParsedVariablesToQueryVariables( parsed_text, vs ){
 
   // ==== add parsed matches [address components] ====
 
   // query - Mexitaly, Sunoco, Lowes
-  if (parsed_text.hasOwnProperty('query')) {
+  if ( ! _.isEmpty(parsed_text.query) ) {
     vs.var('input:query', parsed_text.query);
   }
 
   // categories - restaurants, hotels, bars
-  if (parsed_text.hasOwnProperty('category')) {
+  if ( ! _.isEmpty(parsed_text.category) ) {
     vs.var('input:category', parsed_text.category);
   }
 
-  if (parsed_text.hasOwnProperty('address')) {
+  if ( ! _.isEmpty(parsed_text.address) ) {
     vs.var( 'input:address', parsed_text.address );
   }
 
   // house number
-  if( parsed_text.hasOwnProperty('number') ){
+  if( ! _.isEmpty(parsed_text.number) ){
     vs.var( 'input:housenumber', parsed_text.number );
   }
 
   // street name
-  if( parsed_text.hasOwnProperty('street') ){
+  if( ! _.isEmpty(parsed_text.street) ){
     vs.var( 'input:street', parsed_text.street );
   }
 
   // neighbourhood
-  if (parsed_text.hasOwnProperty('neighbourhood')) {
+  if ( ! _.isEmpty(parsed_text.neighbourhood) ) {
     vs.var( 'input:neighbourhood', parsed_text.neighbourhood);
   }
 
   // borough
-  if (parsed_text.hasOwnProperty('borough')) {
+  if ( ! _.isEmpty(parsed_text.borough) ) {
     vs.var( 'input:borough', parsed_text.borough);
   }
 
   // postal code
-  if( parsed_text.hasOwnProperty('postalcode') ){
+  if( ! _.isEmpty(parsed_text.postalcode) ){
     vs.var( 'input:postcode', parsed_text.postalcode );
   }
 
   // ==== add parsed matches [admin components] ====
 
   // city
-  if( parsed_text.hasOwnProperty('city') ){
+  if( ! _.isEmpty(parsed_text.city) ){
     vs.var( 'input:locality', parsed_text.city );
   }
 
   // county
-  if( parsed_text.hasOwnProperty('county') ){
+  if( ! _.isEmpty(parsed_text.county) ){
     vs.var( 'input:county', parsed_text.county );
   }
 
   // state
-  if( parsed_text.hasOwnProperty('state') ){
+  if( ! _.isEmpty(parsed_text.state) ){
     vs.var( 'input:region', parsed_text.state );
   }
 
   // country
-  if( parsed_text.hasOwnProperty('country') ){
+  if( ! _.isEmpty(parsed_text.country) ){
     vs.var( 'input:country', parsed_text.country );
   }
diff --git a/sanitizer/_city_name_standardizer.js b/sanitizer/_city_name_standardizer.js
new file mode 100644
index 00000000..3bcf817d
--- /dev/null
+++ b/sanitizer/_city_name_standardizer.js
@@ -0,0 +1,53 @@
+const _ = require('lodash');
+
+// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary
+const mountSaintFort = /\b([fm]t|sainte?)\b/g;
+
+const transliterations = {
+  'mt': 'mount',
+  'ft': 'fort',
+  'saint': 'st',
+  'sainte': 'ste'
+};
+
+// look up the replacement for a token matched by mountSaintFort
+function transliterate(match) {
+  return _.get(transliterations, match);
+}
+
+// transliterate ft/mt/saint/sainte to fort/mount/st/ste, respectively
+//
+// lowercases clean.parsed_text.city and rewrites it in place; does nothing
+// when clean.parsed_text.city is empty or missing.  always returns an
+// empty set of messages ({ errors: [], warnings: [] }).
+function sanitize(raw, clean) {
+  // error & warning messages
+  // this function doesn't add any error or warning messages
+  const messages = { errors: [], warnings: [] };
+
+  // only try to transliterate if there is a city in parsed_text
+  if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
+    // eg input: Ft. Saint Louis
+    //    after 1. ft  saint louis
+    //    after 2. fort  st louis
+    //    after 3. fort st louis
+
+    // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
+    const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');
+
+    // 2. transliterate 'saint'->'st', etc
+    const transliterated = periods_removed.replace(mountSaintFort, transliterate);
+
+    // 3. reduce whitespace sequences that can occur when removing periods down to a single space
+    //    (the 'g' flag is required so that every sequence is collapsed, not just the first one)
+    const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/g, ' '));
+
+    clean.parsed_text.city = whitespace_normalized;
+
+  }
+
+  return messages;
+
+}
+
+module.exports = sanitize;
diff --git a/sanitizer/search.js b/sanitizer/search.js
index d99a926e..5694c9eb 100644
--- a/sanitizer/search.js
+++ b/sanitizer/search.js
@@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
       quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
       text: require('../sanitizer/_text'),
       iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
+      city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
       size: require('../sanitizer/_size')(/* use defaults*/),
       layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
       sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),
diff --git a/sanitizer/structured_geocoding.js b/sanitizer/structured_geocoding.js
index ebd55a56..29edce41 100644
--- a/sanitizer/structured_geocoding.js
+++ b/sanitizer/structured_geocoding.js
@@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
       quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
       synthesize_analysis: require('../sanitizer/_synthesize_analysis'),
       iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
+      city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
       size: require('../sanitizer/_size')(/* use defaults*/),
       layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
       sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),
diff --git a/test/unit/query/text_parser.js b/test/unit/query/text_parser.js
index 839ddfbd..86d443fc 100644
--- a/test/unit/query/text_parser.js
+++ b/test/unit/query/text_parser.js
@@ -157,6 +157,48 @@ module.exports.tests.housenumber_special_cases = function(test, common) {
 
 };
 
+module.exports.tests.empty_values = function(test, common) {
+  test('empty string values not set', function (t) {
+    var parsed_text = {
+      query: '',
+      category: '',
+      number: '',
+      street: '',
+      address: '',
+      neighbourhood: '',
+      borough: '',
+      postalcode: '',
+      city: '',
+      county: '',
+      state: '',
+      country: ''
+    };
+    var vs = new VariableStore();
+
+    function testIt() {
+      text_parser(parsed_text, vs);
+    }
+
+    t.doesNotThrow(testIt, 'exception should not be thrown');
+
+    t.false(vs.isset('input:query'));
+    t.false(vs.isset('input:category'));
+    t.false(vs.isset('input:housenumber'));
+    t.false(vs.isset('input:street'));
+    t.false(vs.isset('input:address'));
+    t.false(vs.isset('input:neighbourhood'));
+    t.false(vs.isset('input:borough'));
+    t.false(vs.isset('input:postcode'));
+    t.false(vs.isset('input:locality'));
+    t.false(vs.isset('input:county'));
+    t.false(vs.isset('input:region'));
+    t.false(vs.isset('input:country'));
+    t.end();
+
+  });
+
+};
+
 module.exports.all = function (tape, common) {
   function test(name, testFunction) {
     return tape('text_parser ' + name, testFunction);
diff --git a/test/unit/run.js b/test/unit/run.js
index a3109585..d21890e7 100644
--- a/test/unit/run.js
+++ b/test/unit/run.js
@@ -48,6 +48,7 @@ var tests = [
   require('./sanitizer/_ids'),
   require('./sanitizer/_iso2_to_iso3'),
   require('./sanitizer/_layers'),
+  require('./sanitizer/_city_name_standardizer'),
   require('./sanitizer/_single_scalar_parameters'),
   require('./sanitizer/_size'),
   require('./sanitizer/_sources'),
diff --git a/test/unit/sanitizer/_city_name_standardizer.js b/test/unit/sanitizer/_city_name_standardizer.js
new file mode 100644
index 00000000..da362868
--- /dev/null
+++ b/test/unit/sanitizer/_city_name_standardizer.js
@@ -0,0 +1,309 @@
+const _ = require('lodash');
+const sanitizer = require('../../../sanitizer/_city_name_standardizer');
+
+module.exports.tests = {};
+
+module.exports.tests.text_parser = function(test, common) {
+  test('clean without parsed_text should not throw exception', function(t) {
+    const raw = {};
+
+    const clean = {
+    };
+
+    const expected_clean = {
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('undefined parsed_text.city should be unchanged', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        address: 'address value',
+        city: undefined
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        address: 'address value',
+        city: undefined
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'saint\' should be abbreviated to \'st\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'saint query value',
+        neighbourhood: 'saint neighbourhood value',
+        borough: 'saint borough value',
+        city: 'SainT city sAiNt value saInt',
+        county: 'saint county value',
+        state: 'saint state value',
+        postalcode: 'saint postalcode value',
+        country: 'saint country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'saint query value',
+        neighbourhood: 'saint neighbourhood value',
+        borough: 'saint borough value',
+        city: 'st city st value st',
+        county: 'saint county value',
+        state: 'saint state value',
+        postalcode: 'saint postalcode value',
+        country: 'saint country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'sainte query value',
+        neighbourhood: 'sainte neighbourhood value',
+        borough: 'sainte borough value',
+        city: 'SaintE city sAinTe value saINte',
+        county: 'sainte county value',
+        state: 'sainte state value',
+        postalcode: 'sainte postalcode value',
+        country: 'sainte country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'sainte query value',
+        neighbourhood: 'sainte neighbourhood value',
+        borough: 'sainte borough value',
+        city: 'ste city ste value ste',
+        county: 'sainte county value',
+        state: 'sainte state value',
+        postalcode: 'sainte postalcode value',
+        country: 'sainte country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'ft\' should be expanded to \'fort\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'ft query value',
+        neighbourhood: 'ft neighbourhood value',
+        borough: 'ft borough value',
+        city: 'Ft city ft value fT',
+        county: 'ft county value',
+        state: 'ft state value',
+        postalcode: 'ft postalcode value',
+        country: 'ft country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'ft query value',
+        neighbourhood: 'ft neighbourhood value',
+        borough: 'ft borough value',
+        city: 'fort city fort value fort',
+        county: 'ft county value',
+        state: 'ft state value',
+        postalcode: 'ft postalcode value',
+        country: 'ft country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('\'mt\' should be expanded to \'mount\' wherever it appears in the city', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        query: 'mt query value',
+        neighbourhood: 'mt neighbourhood value',
+        borough: 'mt borough value',
+        city: 'Mt city mt value mT',
+        county: 'mt county value',
+        state: 'mt state value',
+        postalcode: 'mt postalcode value',
+        country: 'mt country value'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        query: 'mt query value',
+        neighbourhood: 'mt neighbourhood value',
+        borough: 'mt borough value',
+        city: 'mount city mount value mount',
+        county: 'mt county value',
+        state: 'mt state value',
+        postalcode: 'mt postalcode value',
+        country: 'mt country value'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt. ft saint sainte mt ft.'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount fort st ste mount fort'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('period word boundary on \'mt.\' should replace with a space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt.city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('period word boundary on \'ft.\' should replace with a space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'ft.city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'fort city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+  test('every whitespace sequence left by period removal should be reduced to a single space', function(t) {
+    const raw = {};
+
+    const clean = {
+      parsed_text: {
+        city: 'mt. saint ft. city'
+      }
+    };
+
+    const expected_clean = {
+      parsed_text: {
+        city: 'mount st fort city'
+      }
+    };
+
+    const messages = sanitizer(raw, clean);
+
+    t.deepEquals(clean, expected_clean);
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+    t.end();
+
+  });
+
+};
+
+module.exports.all = function (tape, common) {
+  function test(name, testFunction) {
+    return tape('sanitizer _city_name_standardizer: ' + name, testFunction);
+  }
+
+  for( const testCase in module.exports.tests ){
+    module.exports.tests[testCase](test, common);
+  }
+};
diff --git a/test/unit/sanitizer/search.js b/test/unit/sanitizer/search.js
index e2c01f8d..0d44103d 100644
--- a/test/unit/sanitizer/search.js
+++ b/test/unit/sanitizer/search.js
@@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
         called_sanitizers.push('_iso2_to_iso3');
         return { errors: [], warnings: [] };
       },
+      '../sanitizer/_city_name_standardizer': function() {
+        called_sanitizers.push('_city_name_standardizer');
+        return { errors: [], warnings: [] };
+      },
       '../sanitizer/_size': function() {
         if (arguments.length === 0) {
           return function() {
@@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
       '_deprecate_quattroshapes',
       '_text',
       '_iso2_to_iso3',
+      '_city_name_standardizer',
       '_size',
       '_targets/layers',
       '_targets/sources',
diff --git a/test/unit/sanitizer/structured_geocoding.js b/test/unit/sanitizer/structured_geocoding.js
index ef9711b3..133be462 100644
--- a/test/unit/sanitizer/structured_geocoding.js
+++ b/test/unit/sanitizer/structured_geocoding.js
@@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
         called_sanitizers.push('_iso2_to_iso3');
         return { errors: [], warnings: [] };
       },
+      '../sanitizer/_city_name_standardizer': function() {
+        called_sanitizers.push('_city_name_standardizer');
+        return { errors: [], warnings: [] };
+      },
       '../sanitizer/_size': function() {
         if (arguments.length === 0) {
           return function() {
@@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
       '_deprecate_quattroshapes',
       '_synthesize_analysis',
       '_iso2_to_iso3',
+      '_city_name_standardizer',
       '_size',
       '_targets/layers',
       '_targets/sources',