Browse Source

Merge pull request #773 from pelias/master

Merge master into staging
pull/774/head
Diana Shkolnikov 8 years ago committed by GitHub
parent
commit
51852026e8
  1. 25
      query/text_parser.js
  2. 47
      sanitizer/_city_name_standardizer.js
  3. 1
      sanitizer/search.js
  4. 1
      sanitizer/structured_geocoding.js
  5. 42
      test/unit/query/text_parser.js
  6. 1
      test/unit/run.js
  7. 285
      test/unit/sanitizer/_city_name_standardizer.js
  8. 5
      test/unit/sanitizer/search.js
  9. 5
      test/unit/sanitizer/structured_geocoding.js

25
query/text_parser.js

@ -1,67 +1,68 @@
var logger = require('pelias-logger').get('api'); var logger = require('pelias-logger').get('api');
var _ = require('lodash');
// all the address parsing logic // all the address parsing logic
function addParsedVariablesToQueryVariables( parsed_text, vs ){ function addParsedVariablesToQueryVariables( parsed_text, vs ){
// ==== add parsed matches [address components] ==== // ==== add parsed matches [address components] ====
// query - Mexitaly, Sunoco, Lowes // query - Mexitaly, Sunoco, Lowes
if (parsed_text.hasOwnProperty('query')) { if ( ! _.isEmpty(parsed_text.query) ) {
vs.var('input:query', parsed_text.query); vs.var('input:query', parsed_text.query);
} }
// categories - restaurants, hotels, bars // categories - restaurants, hotels, bars
if (parsed_text.hasOwnProperty('category')) { if ( ! _.isEmpty(parsed_text.category) ) {
vs.var('input:category', parsed_text.category); vs.var('input:category', parsed_text.category);
} }
if (parsed_text.hasOwnProperty('address')) { if ( ! _.isEmpty(parsed_text.address) ) {
vs.var( 'input:address', parsed_text.address ); vs.var( 'input:address', parsed_text.address );
} }
// house number // house number
if( parsed_text.hasOwnProperty('number') ){ if( ! _.isEmpty(parsed_text.number) ){
vs.var( 'input:housenumber', parsed_text.number ); vs.var( 'input:housenumber', parsed_text.number );
} }
// street name // street name
if( parsed_text.hasOwnProperty('street') ){ if( ! _.isEmpty(parsed_text.street) ){
vs.var( 'input:street', parsed_text.street ); vs.var( 'input:street', parsed_text.street );
} }
// neighbourhood // neighbourhood
if (parsed_text.hasOwnProperty('neighbourhood')) { if ( ! _.isEmpty(parsed_text.neighbourhood) ) {
vs.var( 'input:neighbourhood', parsed_text.neighbourhood); vs.var( 'input:neighbourhood', parsed_text.neighbourhood);
} }
// borough // borough
if (parsed_text.hasOwnProperty('borough')) { if ( ! _.isEmpty(parsed_text.borough) ) {
vs.var( 'input:borough', parsed_text.borough); vs.var( 'input:borough', parsed_text.borough);
} }
// postal code // postal code
if( parsed_text.hasOwnProperty('postalcode') ){ if( ! _.isEmpty(parsed_text.postalcode) ){
vs.var( 'input:postcode', parsed_text.postalcode ); vs.var( 'input:postcode', parsed_text.postalcode );
} }
// ==== add parsed matches [admin components] ==== // ==== add parsed matches [admin components] ====
// city // city
if( parsed_text.hasOwnProperty('city') ){ if( ! _.isEmpty(parsed_text.city) ){
vs.var( 'input:locality', parsed_text.city ); vs.var( 'input:locality', parsed_text.city );
} }
// county // county
if( parsed_text.hasOwnProperty('county') ){ if( ! _.isEmpty(parsed_text.county) ){
vs.var( 'input:county', parsed_text.county ); vs.var( 'input:county', parsed_text.county );
} }
// state // state
if( parsed_text.hasOwnProperty('state') ){ if( ! _.isEmpty(parsed_text.state) ){
vs.var( 'input:region', parsed_text.state ); vs.var( 'input:region', parsed_text.state );
} }
// country // country
if( parsed_text.hasOwnProperty('country') ){ if( ! _.isEmpty(parsed_text.country) ){
vs.var( 'input:country', parsed_text.country ); vs.var( 'input:country', parsed_text.country );
} }

47
sanitizer/_city_name_standardizer.js

@ -0,0 +1,47 @@
const _ = require('lodash');
// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary
// matches 'ft', 'mt', 'saint', and 'sainte' on word boundaries; the `g` flag
// makes String.prototype.replace substitute every occurrence, not just the first
const mountSaintFort = /\b([fm]t|sainte?)\b/g;

// replacement text for each token that mountSaintFort can match
const transliterations = {
  'mt': 'mount',
  'ft': 'fort',
  'saint': 'st',
  'sainte': 'ste'
};

/**
 * Replacer callback handed to String.prototype.replace.
 *
 * @param {string} match - the token matched by mountSaintFort
 * @returns {string} the corresponding entry of `transliterations`
 */
function transliterate(match) {
  return _.get(transliterations, match);
}

/**
 * Standardizes `clean.parsed_text.city` in place: lowercases it, expands
 * ft/mt to fort/mount, abbreviates saint/sainte to st/ste, and collapses
 * whitespace. Nothing is touched when the city is missing or empty.
 *
 * @param {object} raw - unused by this sanitizer
 * @param {object} clean - object whose `parsed_text.city` is rewritten in place
 * @returns {{errors: Array, warnings: Array}} always two empty arrays; this
 *   sanitizer never adds error or warning messages
 */
function sanitize(raw, clean) {
  const messages = { errors: [], warnings: [] };

  // only try to transliterate if there is a city in parsed_text
  if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
    // eg input: Ft. Saint Louis
    //  after 1. ft  saint louis   (period replaced by a space)
    //  after 2. fort  st louis
    //  after 3. fort st louis

    // 1. replace the '.' that can abbreviate ft and mt with a space
    //    (makes the transliteration regex easier, and splits 'mt.city')
    const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');

    // 2. transliterate 'saint'->'st', 'ft'->'fort', etc
    const transliterated = periods_removed.replace(mountSaintFort, transliterate);

    // 3. reduce EVERY whitespace sequence to a single space; without the `g`
    //    flag only the first run was collapsed, so inputs with more than one
    //    'mt.'/'ft.' (or repeated spaces) kept double spaces
    const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/g, ' '));

    clean.parsed_text.city = whitespace_normalized;
  }

  return messages;
}
module.exports = sanitize;

1
sanitizer/search.js

@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'), quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
text: require('../sanitizer/_text'), text: require('../sanitizer/_text'),
iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'), iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
size: require('../sanitizer/_size')(/* use defaults*/), size: require('../sanitizer/_size')(/* use defaults*/),
layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping), layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping), sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),

1
sanitizer/structured_geocoding.js

@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'), quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
synthesize_analysis: require('../sanitizer/_synthesize_analysis'), synthesize_analysis: require('../sanitizer/_synthesize_analysis'),
iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'), iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
size: require('../sanitizer/_size')(/* use defaults*/), size: require('../sanitizer/_size')(/* use defaults*/),
layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping), layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping), sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),

42
test/unit/query/text_parser.js

@ -157,6 +157,48 @@ module.exports.tests.housenumber_special_cases = function(test, common) {
}; };
// Verifies that empty-string fields of parsed_text leave their variables unset.
module.exports.tests.empty_values = function(test, common) {
  test('empty string values not set', function (t) {
    // parsed_text fields, every one of which is given an empty-string value
    var fields = [
      'query',
      'category',
      'number',
      'street',
      'address',
      'neighbourhood',
      'borough',
      'postalcode',
      'city',
      'county',
      'state',
      'country'
    ];

    // variable names that must remain unset after the call
    var variables = [
      'input:query',
      'input:category',
      'input:housenumber',
      'input:street',
      'input:address',
      'input:neighbourhood',
      'input:borough',
      'input:postcode',
      'input:locality',
      'input:county',
      'input:region',
      'input:country'
    ];

    var parsed_text = {};
    fields.forEach(function (field) {
      parsed_text[field] = '';
    });

    var vs = new VariableStore();

    // the call itself must not throw on empty strings
    t.doesNotThrow(function () {
      text_parser(parsed_text, vs);
    }, 'exception should not be thrown');

    variables.forEach(function (name) {
      t.false(vs.isset(name));
    });

    t.end();
  });
};
module.exports.all = function (tape, common) { module.exports.all = function (tape, common) {
function test(name, testFunction) { function test(name, testFunction) {
return tape('text_parser ' + name, testFunction); return tape('text_parser ' + name, testFunction);

1
test/unit/run.js

@ -48,6 +48,7 @@ var tests = [
require('./sanitizer/_ids'), require('./sanitizer/_ids'),
require('./sanitizer/_iso2_to_iso3'), require('./sanitizer/_iso2_to_iso3'),
require('./sanitizer/_layers'), require('./sanitizer/_layers'),
require('./sanitizer/_city_name_standardizer'),
require('./sanitizer/_single_scalar_parameters'), require('./sanitizer/_single_scalar_parameters'),
require('./sanitizer/_size'), require('./sanitizer/_size'),
require('./sanitizer/_sources'), require('./sanitizer/_sources'),

285
test/unit/sanitizer/_city_name_standardizer.js

@ -0,0 +1,285 @@
const _ = require('lodash');
const sanitizer = require('../../../sanitizer/_city_name_standardizer');
module.exports.tests = {};
// Table-driven checks of the sanitizer: each case supplies the `clean` object
// handed to the sanitizer and the state it is expected to be in afterwards.
module.exports.tests.text_parser = function(test, common) {
  // Builds a parsed_text in which every field except city reads
  // '<word> <field> value'; those fields are expected to pass through untouched.
  function parsedText(word, city) {
    return {
      query: word + ' query value',
      neighbourhood: word + ' neighbourhood value',
      borough: word + ' borough value',
      city: city,
      county: word + ' county value',
      state: word + ' state value',
      postalcode: word + ' postalcode value',
      country: word + ' country value'
    };
  }

  const cases = [
    {
      name: 'clean without parsed_text should not throw exception',
      clean: {},
      expected_clean: {}
    },
    {
      name: 'undefined parsed_text.city should be unchanged',
      clean: { parsed_text: { address: 'address value', city: undefined } },
      expected_clean: { parsed_text: { address: 'address value', city: undefined } }
    },
    {
      name: '\'saint\' should be abbreviated to \'st\' wherever it appears in the city',
      clean: { parsed_text: parsedText('saint', 'SainT city sAiNt value saInt') },
      expected_clean: { parsed_text: parsedText('saint', 'st city st value st') }
    },
    {
      name: '\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city',
      clean: { parsed_text: parsedText('sainte', 'SaintE city sAinTe value saINte') },
      expected_clean: { parsed_text: parsedText('sainte', 'ste city ste value ste') }
    },
    {
      name: '\'ft\' should be expanded to \'fort\' wherever it appears in the city',
      clean: { parsed_text: parsedText('ft', 'Ft city ft value fT') },
      expected_clean: { parsed_text: parsedText('ft', 'fort city fort value fort') }
    },
    {
      name: '\'mt\' should be expanded to \'mount\' wherever it appears in the city',
      clean: { parsed_text: parsedText('mt', 'Mt city mt value mT') },
      expected_clean: { parsed_text: parsedText('mt', 'mount city mount value mount') }
    },
    {
      name: 'mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated',
      clean: { parsed_text: { city: 'mt. ft saint sainte mt ft.' } },
      expected_clean: { parsed_text: { city: 'mount fort st ste mount fort' } }
    },
    {
      name: 'period word boundary on \'mt.\' should replace with a space',
      clean: { parsed_text: { city: 'mt.city' } },
      expected_clean: { parsed_text: { city: 'mount city' } }
    },
    {
      name: 'period word boundary on \'ft.\' should replace with a space',
      clean: { parsed_text: { city: 'ft.city' } },
      expected_clean: { parsed_text: { city: 'fort city' } }
    }
  ];

  cases.forEach(function(testCase) {
    test(testCase.name, function(t) {
      // the sanitizer mutates `clean` in place and returns its messages
      const messages = sanitizer({}, testCase.clean);

      t.deepEquals(testCase.clean, testCase.expected_clean);
      t.deepEquals(messages.errors, [], 'no errors');
      t.deepEquals(messages.warnings, [], 'no warnings');
      t.end();
    });
  });
};
// Runs every suite registered on module.exports.tests, prefixing each test
// name so its output identifies this sanitizer.
module.exports.all = function (tape, common) {
  const prefix = 'sanitizer _city_name_standardizer: ';

  // wraps tape so each suite only supplies the short test name
  const test = function (name, testFunction) {
    return tape(prefix + name, testFunction);
  };

  const suites = module.exports.tests;
  for (const suiteName in suites) {
    suites[suiteName](test, common);
  }
};

5
test/unit/sanitizer/search.js

@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
called_sanitizers.push('_iso2_to_iso3'); called_sanitizers.push('_iso2_to_iso3');
return { errors: [], warnings: [] }; return { errors: [], warnings: [] };
}, },
'../sanitizer/_city_name_standardizer': function() {
called_sanitizers.push('_city_name_standardizer');
return { errors: [], warnings: [] };
},
'../sanitizer/_size': function() { '../sanitizer/_size': function() {
if (arguments.length === 0) { if (arguments.length === 0) {
return function() { return function() {
@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
'_deprecate_quattroshapes', '_deprecate_quattroshapes',
'_text', '_text',
'_iso2_to_iso3', '_iso2_to_iso3',
'_city_name_standardizer',
'_size', '_size',
'_targets/layers', '_targets/layers',
'_targets/sources', '_targets/sources',

5
test/unit/sanitizer/structured_geocoding.js

@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
called_sanitizers.push('_iso2_to_iso3'); called_sanitizers.push('_iso2_to_iso3');
return { errors: [], warnings: [] }; return { errors: [], warnings: [] };
}, },
'../sanitizer/_city_name_standardizer': function() {
called_sanitizers.push('_city_name_standardizer');
return { errors: [], warnings: [] };
},
'../sanitizer/_size': function() { '../sanitizer/_size': function() {
if (arguments.length === 0) { if (arguments.length === 0) {
return function() { return function() {
@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
'_deprecate_quattroshapes', '_deprecate_quattroshapes',
'_synthesize_analysis', '_synthesize_analysis',
'_iso2_to_iso3', '_iso2_to_iso3',
'_city_name_standardizer',
'_size', '_size',
'_targets/layers', '_targets/layers',
'_targets/sources', '_targets/sources',

Loading…
Cancel
Save