From d187e7d950b00c75fef4b6d5c2118e25b3da6328 Mon Sep 17 00:00:00 2001 From: Joxit Date: Sun, 21 Jan 2018 16:38:40 +0100 Subject: [PATCH 1/2] Fix for #1077, fail to search `Saint..` cities in structured queries --- sanitizer/_city_name_standardizer.js | 16 ++++++++-------- test/unit/sanitizer/_city_name_standardizer.js | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sanitizer/_city_name_standardizer.js b/sanitizer/_city_name_standardizer.js index 6724eda1..236c0df7 100644 --- a/sanitizer/_city_name_standardizer.js +++ b/sanitizer/_city_name_standardizer.js @@ -1,13 +1,13 @@ const _ = require('lodash'); // matches 'ft', 'mt', 'saint', and 'sainte' on word boundary -const mountSaintFort = /\b([fm]t|sainte?)\b/g; +const mountSaintFort = /\b([fm]t|ste?)\b/g; const transliterations = { 'mt': 'mount', 'ft': 'fort', - 'saint': 'st', - 'sainte': 'ste' + 'st': 'saint', + 'ste': 'sainte' }; function transliterate(match) { @@ -22,15 +22,15 @@ function _sanitize(raw, clean) { // only try to transliterate if there is a city in parsed_text if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { - // eg input: Ft. Saint Louis - // after 1. ft saint louis - // after 2. fort st louis - // after 3. fort st louis + // eg input: Ft. st Louis + // after 1. ft st louis + // after 2. fort saint louis + // after 3. fort saint louis // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier) const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 '); - // 2. transliterate 'saint'->'st', etc + // 2. transliterate 'st'->'saint', etc const transliterated = periods_removed.replace(mountSaintFort, transliterate); // 3. reduce whitespace sequences that can occur when removing periods down to a single space diff --git a/test/unit/sanitizer/_city_name_standardizer.js b/test/unit/sanitizer/_city_name_standardizer.js index 5a01e58b..9ae5917e 100644 --- a/test/unit/sanitizer/_city_name_standardizer.js +++ b/test/unit/sanitizer/_city_name_standardizer.js @@ -48,7 +48,7 @@ module.exports.tests.text_parser = function(test, common) { }); - test('\'saint\' should be abbreviated to \'st\' wherever it appears in the city', function(t) { + test('\'st\' should be expanded to \'saint\' wherever it appears in the city', function(t) { const raw = {}; const clean = { @@ -56,7 +56,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'saint query value', neighbourhood: 'saint neighbourhood value', borough: 'saint borough value', - city: 'SainT city sAiNt value saInt', + city: 'st city ST value St', county: 'saint county value', state: 'saint state value', postalcode: 'saint postalcode value', @@ -69,7 +69,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'saint query value', neighbourhood: 'saint neighbourhood value', borough: 'saint borough value', - city: 'st city st value st', + city: 'saint city saint value saint', county: 'saint county value', state: 'saint state value', postalcode: 'saint postalcode value', @@ -86,7 +86,7 @@ module.exports.tests.text_parser = function(test, common) { }); - test('\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city', function(t) { + test('\'ste\' should be expanded to \'sainte\' wherever it appears in the city', function(t) { const raw = {}; const clean = { @@ -94,7 +94,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'sainte query value', neighbourhood: 'sainte neighbourhood value', borough: 'sainte borough value', - city: 'SaintE city sAinTe value saINte', + city: 'ste city STE value StE', county: 'sainte county value', state: 'sainte state value', postalcode: 'sainte postalcode value', @@ -107,7 +107,7 @@ module.exports.tests.text_parser = function(test, common) { query: 'sainte query value', neighbourhood: 'sainte neighbourhood value', borough: 'sainte borough value', - city: 'ste city ste value ste', + city: 'sainte city sainte value sainte', county: 'sainte county value', state: 'sainte state value', postalcode: 'sainte postalcode value', @@ -200,18 +200,18 @@ module.exports.tests.text_parser = function(test, common) { }); - test('mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated', function(t) { + test('mixture of \'mt\', \'ft\', \'st\', and \'st\' should be expanded', function(t) { const raw = {}; const clean = { parsed_text: { - city: 'mt. ft saint sainte mt ft.' + city: 'mt. ft st ste mt ft.' } }; const expected_clean = { parsed_text: { - city: 'mount fort st ste mount fort' + city: 'mount fort saint sainte mount fort' } }; From 551f0227dbe16827aa92f28a44c0f1a76ac4343a Mon Sep 17 00:00:00 2001 From: Joxit Date: Wed, 28 Feb 2018 13:52:56 +0100 Subject: [PATCH 2/2] Transliterates Saint and Sainte in ES schema see pelias/schema#268 --- sanitizer/_city_name_standardizer.js | 18 ++-- .../unit/sanitizer/_city_name_standardizer.js | 82 +------------------ 2 files changed, 11 insertions(+), 89 deletions(-) diff --git a/sanitizer/_city_name_standardizer.js b/sanitizer/_city_name_standardizer.js index 236c0df7..e4604474 100644 --- a/sanitizer/_city_name_standardizer.js +++ b/sanitizer/_city_name_standardizer.js @@ -1,20 +1,18 @@ const _ = require('lodash'); -// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary -const mountSaintFort = /\b([fm]t|ste?)\b/g; +// matches 'ft', 'mt' on word boundary +const mountFort = /\b([fm]t)\b/g; const transliterations = { 'mt': 'mount', - 'ft': 'fort', - 'st': 'saint', - 'ste': 'sainte' + 'ft': 'fort' }; function transliterate(match) { return _.get(transliterations, match); } -// transliterate ft/mt/saint/sainte to fort/mount/st/ste, respectively +// transliterate ft/mt to fort/mount, respectively function _sanitize(raw, clean) { // error & warning messages // this function doesn't add any error or warning messages @@ -24,14 +22,14 @@ function _sanitize(raw, clean) { if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { // eg input: Ft. st Louis // after 1. ft st louis - // after 2. fort saint louis - // after 3. fort saint louis + // after 2. fort st louis + // after 3. fort st louis // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier) const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 '); - // 2. transliterate 'st'->'saint', etc - const transliterated = periods_removed.replace(mountSaintFort, transliterate); + // 2. transliterate 'ft'->'fort', etc + const transliterated = periods_removed.replace(mountFort, transliterate); // 3. reduce whitespace sequences that can occur when removing periods down to a single space const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/, ' ')); diff --git a/test/unit/sanitizer/_city_name_standardizer.js b/test/unit/sanitizer/_city_name_standardizer.js index 9ae5917e..9e168085 100644 --- a/test/unit/sanitizer/_city_name_standardizer.js +++ b/test/unit/sanitizer/_city_name_standardizer.js @@ -48,82 +48,6 @@ module.exports.tests.text_parser = function(test, common) { }); - test('\'st\' should be expanded to \'saint\' wherever it appears in the city', function(t) { - const raw = {}; - - const clean = { - parsed_text: { - query: 'saint query value', - neighbourhood: 'saint neighbourhood value', - borough: 'saint borough value', - city: 'st city ST value St', - county: 'saint county value', - state: 'saint state value', - postalcode: 'saint postalcode value', - country: 'saint country value' - } - }; - - const expected_clean = { - parsed_text: { - query: 'saint query value', - neighbourhood: 'saint neighbourhood value', - borough: 'saint borough value', - city: 'saint city saint value saint', - county: 'saint county value', - state: 'saint state value', - postalcode: 'saint postalcode value', - country: 'saint country value' - } - }; - - const messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(clean, expected_clean); - t.deepEquals(messages.errors, [], 'no errors'); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - - }); - - test('\'ste\' should be expanded to \'sainte\' wherever it appears in the city', function(t) { - const raw = {}; - - const clean = { - parsed_text: { - query: 'sainte query value', - neighbourhood: 'sainte neighbourhood value', - borough: 'sainte borough value', - city: 'ste city STE value StE', - county: 'sainte county value', - state: 'sainte state value', - postalcode: 'sainte postalcode value', - country: 'sainte country value' - } - }; - - const expected_clean = { - parsed_text: { - query: 'sainte query value', - neighbourhood: 'sainte neighbourhood value', - borough: 'sainte borough value', - city: 'sainte city sainte value sainte', - county: 'sainte county value', - state: 'sainte state value', - postalcode: 'sainte postalcode value', - country: 'sainte country value' - } - }; - - const messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(clean, expected_clean); - t.deepEquals(messages.errors, [], 'no errors'); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - - }); - test('\'ft\' should be expanded to \'fort\' wherever it appears in the city', function(t) { const raw = {}; @@ -200,18 +124,18 @@ module.exports.tests.text_parser = function(test, common) { }); - test('mixture of \'mt\', \'ft\', \'st\', and \'st\' should be expanded', function(t) { + test('mixture of \'mt\', \'ft\' should be expanded', function(t) { const raw = {}; const clean = { parsed_text: { - city: 'mt. ft st ste mt ft.' + city: 'mt. ft mt ft.' } }; const expected_clean = { parsed_text: { - city: 'mount fort saint sainte mount fort' + city: 'mount fort mount fort' } };