mirror of https://github.com/pelias/api.git
Diana Shkolnikov
8 years ago
committed by
GitHub
9 changed files with 400 additions and 12 deletions
@ -0,0 +1,47 @@
|
||||
const _ = require('lodash'); |
||||
|
||||
// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary
|
||||
const mountSaintFort = /\b([fm]t|sainte?)\b/g; |
||||
|
||||
const transliterations = { |
||||
'mt': 'mount', |
||||
'ft': 'fort', |
||||
'saint': 'st', |
||||
'sainte': 'ste' |
||||
}; |
||||
|
||||
function transliterate(match) { |
||||
return _.get(transliterations, match); |
||||
} |
||||
|
||||
// transliterate ft/mt/saint/sainte to fort/mount/st/ste, respectively
|
||||
function sanitize(raw, clean) { |
||||
// error & warning messages
|
||||
// this function doesn't add any error or warning messages
|
||||
const messages = { errors: [], warnings: [] }; |
||||
|
||||
// only try to transliterate if there is a city in parsed_text
|
||||
if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { |
||||
// eg input: Ft. Saint Louis
|
||||
// after 1. ft saint louis
|
||||
// after 2. fort st louis
|
||||
// after 3. fort st louis
|
||||
|
||||
// 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
|
||||
const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 '); |
||||
|
||||
// 2. transliterate 'saint'->'st', etc
|
||||
const transliterated = periods_removed.replace(mountSaintFort, transliterate); |
||||
|
||||
// 3. reduce whitespace sequences that can occur when removing periods down to a single space
|
||||
const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/, ' ')); |
||||
|
||||
clean.parsed_text.city = whitespace_normalized; |
||||
|
||||
} |
||||
|
||||
return messages; |
||||
|
||||
} |
||||
|
||||
module.exports = sanitize; |
@ -0,0 +1,285 @@
|
||||
const _ = require('lodash'); |
||||
const sanitizer = require('../../../sanitizer/_city_name_standardizer'); |
||||
|
||||
module.exports.tests = {}; |
||||
|
||||
module.exports.tests.text_parser = function(test, common) { |
||||
test('clean without parsed_text should not throw exception', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('undefined parsed_text.city should be unchanged', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
address: 'address value', |
||||
city: undefined |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
address: 'address value', |
||||
city: undefined |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('\'saint\' should be abbreviated to \'st\' wherever it appears in the city', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
query: 'saint query value', |
||||
neighbourhood: 'saint neighbourhood value', |
||||
borough: 'saint borough value', |
||||
city: 'SainT city sAiNt value saInt', |
||||
county: 'saint county value', |
||||
state: 'saint state value', |
||||
postalcode: 'saint postalcode value', |
||||
country: 'saint country value' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
query: 'saint query value', |
||||
neighbourhood: 'saint neighbourhood value', |
||||
borough: 'saint borough value', |
||||
city: 'st city st value st', |
||||
county: 'saint county value', |
||||
state: 'saint state value', |
||||
postalcode: 'saint postalcode value', |
||||
country: 'saint country value' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
query: 'sainte query value', |
||||
neighbourhood: 'sainte neighbourhood value', |
||||
borough: 'sainte borough value', |
||||
city: 'SaintE city sAinTe value saINte', |
||||
county: 'sainte county value', |
||||
state: 'sainte state value', |
||||
postalcode: 'sainte postalcode value', |
||||
country: 'sainte country value' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
query: 'sainte query value', |
||||
neighbourhood: 'sainte neighbourhood value', |
||||
borough: 'sainte borough value', |
||||
city: 'ste city ste value ste', |
||||
county: 'sainte county value', |
||||
state: 'sainte state value', |
||||
postalcode: 'sainte postalcode value', |
||||
country: 'sainte country value' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('\'ft\' should be expanded to \'fort\' wherever it appears in the city', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
query: 'ft query value', |
||||
neighbourhood: 'ft neighbourhood value', |
||||
borough: 'ft borough value', |
||||
city: 'Ft city ft value fT', |
||||
county: 'ft county value', |
||||
state: 'ft state value', |
||||
postalcode: 'ft postalcode value', |
||||
country: 'ft country value' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
query: 'ft query value', |
||||
neighbourhood: 'ft neighbourhood value', |
||||
borough: 'ft borough value', |
||||
city: 'fort city fort value fort', |
||||
county: 'ft county value', |
||||
state: 'ft state value', |
||||
postalcode: 'ft postalcode value', |
||||
country: 'ft country value' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('\'mt\' should be expanded to \'mount\' wherever it appears in the city', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
query: 'mt query value', |
||||
neighbourhood: 'mt neighbourhood value', |
||||
borough: 'mt borough value', |
||||
city: 'Mt city mt value mT', |
||||
county: 'mt county value', |
||||
state: 'mt state value', |
||||
postalcode: 'mt postalcode value', |
||||
country: 'mt country value' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
query: 'mt query value', |
||||
neighbourhood: 'mt neighbourhood value', |
||||
borough: 'mt borough value', |
||||
city: 'mount city mount value mount', |
||||
county: 'mt county value', |
||||
state: 'mt state value', |
||||
postalcode: 'mt postalcode value', |
||||
country: 'mt country value' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
city: 'mt. ft saint sainte mt ft.' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
city: 'mount fort st ste mount fort' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('period word boundary on \'mt.\' should replace with a space', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
city: 'mt.city' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
city: 'mount city' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
test('period word boundary on \'ft.\' should replace with a space', function(t) { |
||||
const raw = {}; |
||||
|
||||
const clean = { |
||||
parsed_text: { |
||||
city: 'ft.city' |
||||
} |
||||
}; |
||||
|
||||
const expected_clean = { |
||||
parsed_text: { |
||||
city: 'fort city' |
||||
} |
||||
}; |
||||
|
||||
const messages = sanitizer(raw, clean); |
||||
|
||||
t.deepEquals(clean, expected_clean); |
||||
t.deepEquals(messages.errors, [], 'no errors'); |
||||
t.deepEquals(messages.warnings, [], 'no warnings'); |
||||
t.end(); |
||||
|
||||
}); |
||||
|
||||
}; |
||||
|
||||
module.exports.all = function (tape, common) { |
||||
function test(name, testFunction) { |
||||
return tape('sanitizer _city_name_standardizer: ' + name, testFunction); |
||||
} |
||||
|
||||
for( const testCase in module.exports.tests ){ |
||||
module.exports.tests[testCase](test, common); |
||||
} |
||||
}; |
Loading…
Reference in new issue