Browse Source

Fix for #1077: failure to find `Saint...` cities in structured queries

pull/1091/head
Joxit 7 years ago committed by Julian Simioni
parent
commit
d187e7d950
No known key found for this signature in database
GPG Key ID: B9EEB0C6EE0910A1
  1. 16
      sanitizer/_city_name_standardizer.js
  2. 18
      test/unit/sanitizer/_city_name_standardizer.js

16
sanitizer/_city_name_standardizer.js

@ -1,13 +1,13 @@
const _ = require('lodash'); const _ = require('lodash');
// matches 'ft', 'mt', 'saint', and 'sainte' on word boundary // matches 'ft', 'mt', 'st', and 'ste' on word boundary
const mountSaintFort = /\b([fm]t|sainte?)\b/g; const mountSaintFort = /\b([fm]t|ste?)\b/g;
const transliterations = { const transliterations = {
'mt': 'mount', 'mt': 'mount',
'ft': 'fort', 'ft': 'fort',
'saint': 'st', 'st': 'saint',
'sainte': 'ste' 'ste': 'sainte'
}; };
function transliterate(match) { function transliterate(match) {
@ -22,15 +22,15 @@ function _sanitize(raw, clean) {
// only try to transliterate if there is a city in parsed_text // only try to transliterate if there is a city in parsed_text
if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) { if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
// eg input: Ft. Saint Louis // eg input: Ft. st Louis
// after 1. ft saint louis // after 1. ft st louis
// after 2. fort st louis // after 2. fort saint louis
// after 3. fort st louis // after 3. fort saint louis
// 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier) // 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 '); const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');
// 2. transliterate 'saint'->'st', etc // 2. transliterate 'st'->'saint', etc
const transliterated = periods_removed.replace(mountSaintFort, transliterate); const transliterated = periods_removed.replace(mountSaintFort, transliterate);
// 3. reduce whitespace sequences that can occur when removing periods down to a single space // 3. reduce whitespace sequences that can occur when removing periods down to a single space

18
test/unit/sanitizer/_city_name_standardizer.js

@ -48,7 +48,7 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('\'saint\' should be abbreviated to \'st\' wherever it appears in the city', function(t) { test('\'st\' should be expanded to \'saint\' wherever it appears in the city', function(t) {
const raw = {}; const raw = {};
const clean = { const clean = {
@ -56,7 +56,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'saint query value', query: 'saint query value',
neighbourhood: 'saint neighbourhood value', neighbourhood: 'saint neighbourhood value',
borough: 'saint borough value', borough: 'saint borough value',
city: 'SainT city sAiNt value saInt', city: 'st city ST value St',
county: 'saint county value', county: 'saint county value',
state: 'saint state value', state: 'saint state value',
postalcode: 'saint postalcode value', postalcode: 'saint postalcode value',
@ -69,7 +69,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'saint query value', query: 'saint query value',
neighbourhood: 'saint neighbourhood value', neighbourhood: 'saint neighbourhood value',
borough: 'saint borough value', borough: 'saint borough value',
city: 'st city st value st', city: 'saint city saint value saint',
county: 'saint county value', county: 'saint county value',
state: 'saint state value', state: 'saint state value',
postalcode: 'saint postalcode value', postalcode: 'saint postalcode value',
@ -86,7 +86,7 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('\'sainte\' should be abbreviated to \'ste\' wherever it appears in the city', function(t) { test('\'ste\' should be expanded to \'sainte\' wherever it appears in the city', function(t) {
const raw = {}; const raw = {};
const clean = { const clean = {
@ -94,7 +94,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'sainte query value', query: 'sainte query value',
neighbourhood: 'sainte neighbourhood value', neighbourhood: 'sainte neighbourhood value',
borough: 'sainte borough value', borough: 'sainte borough value',
city: 'SaintE city sAinTe value saINte', city: 'ste city STE value StE',
county: 'sainte county value', county: 'sainte county value',
state: 'sainte state value', state: 'sainte state value',
postalcode: 'sainte postalcode value', postalcode: 'sainte postalcode value',
@ -107,7 +107,7 @@ module.exports.tests.text_parser = function(test, common) {
query: 'sainte query value', query: 'sainte query value',
neighbourhood: 'sainte neighbourhood value', neighbourhood: 'sainte neighbourhood value',
borough: 'sainte borough value', borough: 'sainte borough value',
city: 'ste city ste value ste', city: 'sainte city sainte value sainte',
county: 'sainte county value', county: 'sainte county value',
state: 'sainte state value', state: 'sainte state value',
postalcode: 'sainte postalcode value', postalcode: 'sainte postalcode value',
@ -200,18 +200,18 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('mixture of \'mt\', \'ft\', \'saint\', and \'sainte\' should be expanded/abbreviated', function(t) { test('mixture of \'mt\', \'ft\', \'st\', and \'ste\' should be expanded', function(t) {
const raw = {}; const raw = {};
const clean = { const clean = {
parsed_text: { parsed_text: {
city: 'mt. ft saint sainte mt ft.' city: 'mt. ft st ste mt ft.'
} }
}; };
const expected_clean = { const expected_clean = {
parsed_text: { parsed_text: {
city: 'mount fort st ste mount fort' city: 'mount fort saint sainte mount fort'
} }
}; };

Loading…
Cancel
Save