Browse Source

streamlined standardizer

more accurate since it's smarter about where to replace periods
pull/767/head
Stephen Hess 8 years ago
parent
commit
8f6d820b43
  1. 27
      sanitizer/_mount_saint_fort_standardizer.js
  2. 60
      test/unit/sanitizer/_mount_saint_fort_standardizer.js

27
sanitizer/_mount_saint_fort_standardizer.js

@ -3,25 +3,42 @@ const _ = require('lodash');
// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary // matches 'ft', 'mt', 'saint', and 'sainte' on word boundary
const mountSaintFort = /\b([fm]t|sainte?)\b/g; const mountSaintFort = /\b([fm]t|sainte?)\b/g;
const translations = { const transliterations = {
'mt': 'mount', 'mt': 'mount',
'ft': 'fort', 'ft': 'fort',
'saint': 'st', 'saint': 'st',
'sainte': 'ste' 'sainte': 'ste'
}; };
function translate(match) { function transliterate(match) {
return _.get(translations, match); return _.get(transliterations, match);
} }
// transliterate ft/mt/saint/sainte to fort/mount/st/ste, respectively
function sanitize(raw, clean) { function sanitize(raw, clean) {
// error & warning messages // error & warning messages
// this function doesn't add any error or warning messages // this function doesn't add any error or warning messages
const messages = { errors: [], warnings: [] }; const messages = { errors: [], warnings: [] };
// only try to transliterate if there is a city in parsed_text
if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
// replace ft/mt/saint/sainte with fort/mount/st/ste, respectively // eg input: Ft. Saint Louis
clean.parsed_text.city = _.toLower(clean.parsed_text.city.replace(/\./g, '')).replace(mountSaintFort, translate); // after 1. ft  saint louis (the period becomes a space, leaving two spaces)
// after 2. fort  st louis (still two spaces)
// after 3. fort st louis (whitespace collapsed)
// 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');
// 2. transliterate 'saint'->'st', etc
const transliterated = periods_removed.replace(mountSaintFort, transliterate);
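// 3. whitespace-normalize by collapsing runs of whitespace into a single space and trimming the end;
// duplicate whitespace can be introduced when removing periods
// NOTE(review): the regex on the next line has no `g` flag, so only the first run of whitespace is collapsed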
const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/, ' '));
clean.parsed_text.city = whitespace_normalized;
} }
return messages; return messages;

60
test/unit/sanitizer/_mount_saint_fort_standardizer.js

@ -56,7 +56,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'saint query value', query: 'saint query value',
neighbourhood: 'saint neighbourhood value', neighbourhood: 'saint neighbourhood value',
borough: 'saint borough value', borough: 'saint borough value',
city: 'saint city saint value saint', city: 'SainT city sAiNt value saInt',
county: 'saint county value', county: 'saint county value',
state: 'saint state value', state: 'saint state value',
postalcode: 'saint postalcode value', postalcode: 'saint postalcode value',
@ -94,7 +94,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'sainte query value', query: 'sainte query value',
neighbourhood: 'sainte neighbourhood value', neighbourhood: 'sainte neighbourhood value',
borough: 'sainte borough value', borough: 'sainte borough value',
city: 'sainte city sainte value sainte', city: 'SaintE city sAinTe value saINte',
county: 'sainte county value', county: 'sainte county value',
state: 'sainte state value', state: 'sainte state value',
postalcode: 'sainte postalcode value', postalcode: 'sainte postalcode value',
@ -132,7 +132,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'ft query value', query: 'ft query value',
neighbourhood: 'ft neighbourhood value', neighbourhood: 'ft neighbourhood value',
borough: 'ft borough value', borough: 'ft borough value',
city: 'ft city ft value ft', city: 'Ft city ft value fT',
county: 'ft county value', county: 'ft county value',
state: 'ft state value', state: 'ft state value',
postalcode: 'ft postalcode value', postalcode: 'ft postalcode value',
@ -170,7 +170,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'mt query value', query: 'mt query value',
neighbourhood: 'mt neighbourhood value', neighbourhood: 'mt neighbourhood value',
borough: 'mt borough value', borough: 'mt borough value',
city: 'mt city mt value mt', city: 'Mt city mt value mT',
county: 'mt county value', county: 'mt county value',
state: 'mt state value', state: 'mt state value',
postalcode: 'mt postalcode value', postalcode: 'mt postalcode value',
@ -205,13 +205,61 @@ module.exports.tests.text_parser = function(test, common) {
const clean = { const clean = {
parsed_text: { parsed_text: {
city: 'mt. ft. saint sainte' city: 'mt. ft saint sainte mt ft.'
} }
}; };
const expected_clean = { const expected_clean = {
parsed_text: { parsed_text: {
city: 'mount fort st ste' city: 'mount fort st ste mount fort'
}
};
const messages = sanitizer(raw, clean);
t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();
});
test('period word boundary on \'mt.\' should replace with a space', function(t) {
const raw = {};
const clean = {
parsed_text: {
city: 'mt.city'
}
};
const expected_clean = {
parsed_text: {
city: 'mount city'
}
};
const messages = sanitizer(raw, clean);
t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();
});
test('period word boundary on \'ft.\' should replace with a space', function(t) {
const raw = {};
const clean = {
parsed_text: {
city: 'ft.city'
}
};
const expected_clean = {
parsed_text: {
city: 'fort city'
} }
}; };

Loading…
Cancel
Save