Browse Source

Merge pull request #767 from pelias/add-mt-st-ft-normalizing

Add Mt/St/Ft normalizing
pull/770/head
Stephen K Hess 8 years ago committed by GitHub
parent
commit
e99da45b1d
  1. 47
      sanitizer/_city_name_standardizer.js
  2. 1
      sanitizer/search.js
  3. 1
      sanitizer/structured_geocoding.js
  4. 1
      test/unit/run.js
  5. 285
      test/unit/sanitizer/_city_name_standardizer.js
  6. 5
      test/unit/sanitizer/search.js
  7. 5
      test/unit/sanitizer/structured_geocoding.js

47
sanitizer/_city_name_standardizer.js

@ -0,0 +1,47 @@
const _ = require('lodash');
// Tokens that get rewritten in a city name: 'ft', 'mt', 'saint' and 'sainte',
// each delimited by word boundaries.  The 'g' flag lets a single
// String.prototype.replace call rewrite every occurrence.
const mountSaintFort = /\b([fm]t|sainte?)\b/g;

// Lookup from a matched token to its standardized spelling:
// 'ft'/'mt' are expanded, 'saint'/'sainte' are abbreviated.
const transliterations = {
  sainte: 'ste',
  saint: 'st',
  ft: 'fort',
  mt: 'mount'
};
/**
 * Replacement callback handed to String.prototype.replace together with
 * the mountSaintFort regex.
 *
 * @param {string} match - the token that was matched
 * @returns {string|undefined} the entry of `transliterations` stored under
 *   `match`, or undefined when there is no such entry
 */
function transliterate(match) {
  const replacement = _.get(transliterations, match);
  return replacement;
}
/**
 * Standardizes `clean.parsed_text.city` in place: the value is lowercased,
 * ft/mt (with or without a trailing period) become fort/mount, and
 * saint/sainte become st/ste.  Nothing is changed when `clean` has no
 * non-empty `parsed_text.city`.
 *
 * @param {object} raw - unused by this sanitizer
 * @param {object} clean - object whose `parsed_text.city` is rewritten
 * @returns {{errors: Array, warnings: Array}} always-empty message lists
 */
function sanitize(raw, clean) {
  // error & warning messages
  // this function doesn't add any error or warning messages
  const messages = { errors: [], warnings: [] };

  // only try to transliterate if there is a city in parsed_text
  if (_.isEmpty(_.get(clean, 'parsed_text.city'))) {
    return messages;
  }

  // eg input: Ft. Saint Louis
  // after 1.  ft  saint louis   (the period becomes a second space)
  // after 2.  fort  st louis
  // after 3.  fort st louis

  // 1. replace the '.' that can abbreviate ft and mt with a space
  //    (makes the transliteration regex easier)
  const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');

  // 2. transliterate 'saint'->'st', etc
  const transliterated = periods_removed.replace(mountSaintFort, transliterate);

  // 3. reduce EVERY whitespace sequence that can occur when removing periods
  //    down to a single space; the 'g' flag is required, otherwise only the
  //    first sequence is collapsed and 'ft. x mt. y' keeps a double space
  const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/g, ' '));

  clean.parsed_text.city = whitespace_normalized;

  return messages;
}
module.exports = sanitize;

1
sanitizer/search.js

@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'), quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
text: require('../sanitizer/_text'), text: require('../sanitizer/_text'),
iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'), iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
size: require('../sanitizer/_size')(/* use defaults*/), size: require('../sanitizer/_size')(/* use defaults*/),
layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping), layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping), sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),

1
sanitizer/structured_geocoding.js

@ -6,6 +6,7 @@ var sanitizeAll = require('../sanitizer/sanitizeAll'),
quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'), quattroshapes_deprecation: require('../sanitizer/_deprecate_quattroshapes'),
synthesize_analysis: require('../sanitizer/_synthesize_analysis'), synthesize_analysis: require('../sanitizer/_synthesize_analysis'),
iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'), iso2_to_iso3: require('../sanitizer/_iso2_to_iso3'),
city_name_standardizer: require('../sanitizer/_city_name_standardizer'),
size: require('../sanitizer/_size')(/* use defaults*/), size: require('../sanitizer/_size')(/* use defaults*/),
layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping), layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping),
sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping), sources: require('../sanitizer/_targets')('sources', type_mapping.source_mapping),

1
test/unit/run.js

@ -48,6 +48,7 @@ var tests = [
require('./sanitizer/_ids'), require('./sanitizer/_ids'),
require('./sanitizer/_iso2_to_iso3'), require('./sanitizer/_iso2_to_iso3'),
require('./sanitizer/_layers'), require('./sanitizer/_layers'),
require('./sanitizer/_city_name_standardizer'),
require('./sanitizer/_single_scalar_parameters'), require('./sanitizer/_single_scalar_parameters'),
require('./sanitizer/_size'), require('./sanitizer/_size'),
require('./sanitizer/_sources'), require('./sanitizer/_sources'),

285
test/unit/sanitizer/_city_name_standardizer.js

@ -0,0 +1,285 @@
const _ = require('lodash');
const sanitizer = require('../../../sanitizer/_city_name_standardizer');
module.exports.tests = {};
module.exports.tests.text_parser = function(test, common) {
  // parsed_text fields other than city, split around where city sits in the
  // fixtures; the assertions require the sanitizer to leave them unchanged
  const fields_before_city = ['query', 'neighbourhood', 'borough'];
  const fields_after_city = ['county', 'state', 'postalcode', 'country'];

  // builds { parsed_text: {...} } where every non-city field has the value
  // '<word> <field> value' and city has the supplied value
  function cleanWithCity(word, city) {
    const parsed_text = {};

    fields_before_city.forEach(function(field) {
      parsed_text[field] = word + ' ' + field + ' value';
    });
    parsed_text.city = city;
    fields_after_city.forEach(function(field) {
      parsed_text[field] = word + ' ' + field + ' value';
    });

    return { parsed_text: parsed_text };
  }

  // runs the sanitizer on `clean`, then checks the mutated object and that
  // no errors or warnings were returned
  function assertSanitized(t, clean, expected_clean) {
    const messages = sanitizer({}, clean);

    t.deepEquals(clean, expected_clean);
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');
    t.end();
  }

  test('clean without parsed_text should not throw exception', function(t) {
    assertSanitized(t, {}, {});
  });

  test('undefined parsed_text.city should be unchanged', function(t) {
    assertSanitized(
      t,
      { parsed_text: { address: 'address value', city: undefined } },
      { parsed_text: { address: 'address value', city: undefined } }
    );
  });

  // one test per token: mixed-case occurrences in city are rewritten while
  // the same token in every other field is left as-is
  const token_cases = [
    {
      name: "'saint' should be abbreviated to 'st' wherever it appears in the city",
      word: 'saint',
      city: 'SainT city sAiNt value saInt',
      expected_city: 'st city st value st'
    },
    {
      name: "'sainte' should be abbreviated to 'ste' wherever it appears in the city",
      word: 'sainte',
      city: 'SaintE city sAinTe value saINte',
      expected_city: 'ste city ste value ste'
    },
    {
      name: "'ft' should be expanded to 'fort' wherever it appears in the city",
      word: 'ft',
      city: 'Ft city ft value fT',
      expected_city: 'fort city fort value fort'
    },
    {
      name: "'mt' should be expanded to 'mount' wherever it appears in the city",
      word: 'mt',
      city: 'Mt city mt value mT',
      expected_city: 'mount city mount value mount'
    }
  ];

  token_cases.forEach(function(token_case) {
    test(token_case.name, function(t) {
      assertSanitized(
        t,
        cleanWithCity(token_case.word, token_case.city),
        cleanWithCity(token_case.word, token_case.expected_city)
      );
    });
  });

  // tests where parsed_text holds nothing but a city
  const city_only_cases = [
    {
      name: "mixture of 'mt', 'ft', 'saint', and 'sainte' should be expanded/abbreviated",
      city: 'mt. ft saint sainte mt ft.',
      expected_city: 'mount fort st ste mount fort'
    },
    {
      name: "period word boundary on 'mt.' should replace with a space",
      city: 'mt.city',
      expected_city: 'mount city'
    },
    {
      name: "period word boundary on 'ft.' should replace with a space",
      city: 'ft.city',
      expected_city: 'fort city'
    }
  ];

  city_only_cases.forEach(function(city_case) {
    test(city_case.name, function(t) {
      assertSanitized(
        t,
        { parsed_text: { city: city_case.city } },
        { parsed_text: { city: city_case.expected_city } }
      );
    });
  });
};
// Registers every test group in module.exports.tests with the supplied tape
// function, prefixing each test name with this sanitizer's label.
module.exports.all = function (tape, common) {
  const label = 'sanitizer _city_name_standardizer: ';

  // wrapper handed to each group in place of tape itself
  const test = function(name, testFunction) {
    return tape(label + name, testFunction);
  };

  const groups = module.exports.tests;
  for( const groupName in groups ){
    groups[groupName](test, common);
  }
};

5
test/unit/sanitizer/search.js

@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
called_sanitizers.push('_iso2_to_iso3'); called_sanitizers.push('_iso2_to_iso3');
return { errors: [], warnings: [] }; return { errors: [], warnings: [] };
}, },
'../sanitizer/_city_name_standardizer': function() {
called_sanitizers.push('_city_name_standardizer');
return { errors: [], warnings: [] };
},
'../sanitizer/_size': function() { '../sanitizer/_size': function() {
if (arguments.length === 0) { if (arguments.length === 0) {
return function() { return function() {
@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
'_deprecate_quattroshapes', '_deprecate_quattroshapes',
'_text', '_text',
'_iso2_to_iso3', '_iso2_to_iso3',
'_city_name_standardizer',
'_size', '_size',
'_targets/layers', '_targets/layers',
'_targets/sources', '_targets/sources',

5
test/unit/sanitizer/structured_geocoding.js

@ -25,6 +25,10 @@ module.exports.tests.sanitize = function(test, common) {
called_sanitizers.push('_iso2_to_iso3'); called_sanitizers.push('_iso2_to_iso3');
return { errors: [], warnings: [] }; return { errors: [], warnings: [] };
}, },
'../sanitizer/_city_name_standardizer': function() {
called_sanitizers.push('_city_name_standardizer');
return { errors: [], warnings: [] };
},
'../sanitizer/_size': function() { '../sanitizer/_size': function() {
if (arguments.length === 0) { if (arguments.length === 0) {
return function() { return function() {
@ -86,6 +90,7 @@ module.exports.tests.sanitize = function(test, common) {
'_deprecate_quattroshapes', '_deprecate_quattroshapes',
'_synthesize_analysis', '_synthesize_analysis',
'_iso2_to_iso3', '_iso2_to_iso3',
'_city_name_standardizer',
'_size', '_size',
'_targets/layers', '_targets/layers',
'_targets/sources', '_targets/sources',

Loading…
Cancel
Save