From 8f6d820b430d8b42493a7201afe4c2e7d9163711 Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Tue, 13 Dec 2016 00:29:37 -0500 Subject: [PATCH] streamlined standardizer more accurate since it's smarter about where to replace periods --- sanitizer/_mount_saint_fort_standardizer.js | 33 +++++++--- .../_mount_saint_fort_standardizer.js | 60 +++++++++++++++++-- 2 files changed, 79 insertions(+), 14 deletions(-) diff --git a/sanitizer/_mount_saint_fort_standardizer.js b/sanitizer/_mount_saint_fort_standardizer.js index 1ee6c59a..72e8aa10 100644 --- a/sanitizer/_mount_saint_fort_standardizer.js +++ b/sanitizer/_mount_saint_fort_standardizer.js @@ -3,25 +3,42 @@ const _ = require('lodash'); // matches 'ft', 'mt', 'saint', and 'sainte' on word boundary const mountSaintFort = /\b([fm]t|sainte?)\b/g; -const translations = { - 'mt': 'mount', - 'ft': 'fort', - 'saint': 'st', +const transliterations = { + 'mt': 'mount', + 'ft': 'fort', + 'saint': 'st', 'sainte': 'ste' }; -function translate(match) { - return _.get(translations, match); +function transliterate(match) { + return _.get(transliterations, match); } +// transliterate ft/mt/saint/sainte to fort/mount/st/ste, respectively function sanitize(raw, clean) { // error & warning messages // this function doesn't add any error or warning messages const messages = { errors: [], warnings: [] }; + // only try to transliterate if there is a city in parsed_text if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { - // replace ft/mt/saint/sainte with fort/mount/st/ste, respectively - clean.parsed_text.city = _.toLower(clean.parsed_text.city.replace(/\./g, '')).replace(mountSaintFort, translate); + // eg input: Ft. Saint Louis + // after 1. ft saint louis + // after 2. fort st louis + // after 3. fort st louis + + // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier) + const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 '); + + // 2. transliterate 'saint'->'st', etc + const transliterated = periods_removed.replace(mountSaintFort, transliterate); + + // 3. whitespace-normalize by replacing much whitespace with a space and trimming + // duplicate whitespace can be introduced when removing periods + const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/, ' ')); + + clean.parsed_text.city = whitespace_normalized; + } return messages; diff --git a/test/unit/sanitizer/_mount_saint_fort_standardizer.js b/test/unit/sanitizer/_mount_saint_fort_standardizer.js index 7d069257..54e68fea 100644 --- a/test/unit/sanitizer/_mount_saint_fort_standardizer.js +++ b/test/unit/sanitizer/_mount_saint_fort_standardizer.js @@ -56,7 +56,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'saint query value', neighbourhood: 'saint neighbourhood value', borough: 'saint borough value', - city: 'saint city saint value saint', + city: 'SainT city sAiNt value saInt', county: 'saint county value', state: 'saint state value', postalcode: 'saint postalcode value', @@ -94,7 +94,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'sainte query value', neighbourhood: 'sainte neighbourhood value', borough: 'sainte borough value', - city: 'sainte city sainte value sainte', + city: 'SaintE city sAinTe value saINte', county: 'sainte county value', state: 'sainte state value', postalcode: 'sainte postalcode value', @@ -132,7 +132,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'ft query value', neighbourhood: 'ft neighbourhood value', borough: 'ft borough value', - city: 'ft city ft value ft', + city: 'Ft city ft value fT', county: 'ft county value', state: 'ft state value', postalcode: 'ft postalcode value', @@ -170,7 +170,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'mt query value', neighbourhood: 'mt neighbourhood value', borough: 'mt borough value', - city: 'mt city mt value mt', + city: 'Mt city mt value mT', county: 'mt county value', state: 'mt state value', postalcode: 'mt postalcode value', @@ -205,13 +205,61 @@ module.exports.tests.text_parser = function(test, common) { const clean = { parsed_text: { - city: 'mt. ft. saint sainte' + city: 'mt. ft saint sainte mt ft.' } }; const expected_clean = { parsed_text: { - city: 'mount fort st ste' + city: 'mount fort st ste mount fort' + } + }; + + const messages = sanitizer(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + + }); + + test('period word boundary on \'mt.\' should replace with a space', function(t) { + const raw = {}; + + const clean = { + parsed_text: { + city: 'mt.city' + } + }; + + const expected_clean = { + parsed_text: { + city: 'mount city' + } + }; + + const messages = sanitizer(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + + }); + + test('period word boundary on \'ft.\' should replace with a space', function(t) { + const raw = {}; + + const clean = { + parsed_text: { + city: 'ft.city' + } + }; + + const expected_clean = { + parsed_text: { + city: 'fort city' } };