From d721101f700799d3203296e661b337f7e646d97a Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Wed, 23 Nov 2016 00:11:59 -0500 Subject: [PATCH 1/6] tie query to specific hash --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index d9a588d1..80fd13e3 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-labels": "1.5.1", "pelias-logger": "0.1.0", "pelias-model": "4.3.0", - "pelias-query": "8.9.0", + "pelias-query": "pelias/query#5338af9", "pelias-text-analyzer": "1.6.0", "stats-lite": "2.0.3", "through2": "2.0.1" From da72a968f9e619e0604ab5be33271fb1d8c3ea5f Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Wed, 23 Nov 2016 00:12:57 -0500 Subject: [PATCH 2/6] switch importance of address/street --- middleware/trimByGranularityComponent.js | 4 ++-- .../middleware/trimByGranularityComponent.js | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/middleware/trimByGranularityComponent.js b/middleware/trimByGranularityComponent.js index afef01f0..1fe94ce8 100644 --- a/middleware/trimByGranularityComponent.js +++ b/middleware/trimByGranularityComponent.js @@ -22,8 +22,8 @@ const _ = require('lodash'); // supplied we want to retain borough=Manhattan and city=Manhattan results const layers = [ 'venue', - 'street', 'address', + 'street', 'neighbourhood', ['borough', 'locality'], 'localadmin', @@ -44,8 +44,8 @@ const layers = [ // city=Manhattan const explicit_borough_layers = [ 'venue', - 'street', 'address', + 'street', 'neighbourhood', 'borough', 'locality', diff --git a/test/unit/middleware/trimByGranularityComponent.js b/test/unit/middleware/trimByGranularityComponent.js index 69c80e92..d90a3803 100644 --- a/test/unit/middleware/trimByGranularityComponent.js +++ b/test/unit/middleware/trimByGranularityComponent.js @@ -19,8 +19,8 @@ module.exports.tests.trimByGranularity = function(test, common) { data: [ { name: 'venue 1', _matched_queries: ['fallback.venue'] }, { name: 'venue 2', _matched_queries: ['fallback.venue'] }, - { name: 'street 1', _matched_queries: ['fallback.street'] }, { name: 'address 1', _matched_queries: ['fallback.address'] }, + { name: 'street 1', _matched_queries: ['fallback.street'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] }, @@ -55,9 +55,9 @@ module.exports.tests.trimByGranularity = function(test, common) { var res = { data: [ - { name: 'street 1', _matched_queries: ['fallback.street'] }, - { name: 'street 2', _matched_queries: ['fallback.street'] }, { name: 'address 1', _matched_queries: ['fallback.address'] }, + { name: 'address 2', _matched_queries: ['fallback.address'] }, + { name: 'street 1', _matched_queries: ['fallback.street'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] }, @@ -73,13 +73,13 @@ module.exports.tests.trimByGranularity = function(test, common) { }; var expected_data = [ - { name: 'street 1', _matched_queries: ['fallback.street'] }, - { name: 'street 2', _matched_queries: ['fallback.street'] } + { name: 'address 1', _matched_queries: ['fallback.address'] }, + { name: 'address 2', _matched_queries: ['fallback.address'] } ]; function testIt() { trimByGranularity(req, res, function() { - t.deepEquals(res.data, expected_data, 'only street records should be here'); + t.deepEquals(res.data, expected_data, 'only address records should be here'); t.end(); }); } @@ -87,13 +87,13 @@ module.exports.tests.trimByGranularity = function(test, common) { testIt(); }); - test('all records with fallback.* matched_queries name should retain only address when they are most granular', function(t) { + test('all records with fallback.* matched_queries name should retain only street when they are most granular', function(t) { var req = { clean: {} }; var res = { data: [ - { name: 'address 1', _matched_queries: ['fallback.address'] }, - { name: 'address 2', _matched_queries: ['fallback.address'] }, + { name: 'street 1', _matched_queries: ['fallback.street'] }, + { name: 'street 2', _matched_queries: ['fallback.street'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] }, @@ -109,13 +109,13 @@ module.exports.tests.trimByGranularity = function(test, common) { }; var expected_data = [ - { name: 'address 1', _matched_queries: ['fallback.address'] }, - { name: 'address 2', _matched_queries: ['fallback.address'] }, + { name: 'street 1', _matched_queries: ['fallback.street'] }, + { name: 'street 2', _matched_queries: ['fallback.street'] }, ]; function testIt() { trimByGranularity(req, res, function() { - t.deepEquals(res.data, expected_data, 'only address records should be here'); + t.deepEquals(res.data, expected_data, 'only street records should be here'); t.end(); }); } From 7aad07368debee6e637acf17a617c18c790b5135 Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Wed, 23 Nov 2016 00:13:28 -0500 Subject: [PATCH 3/6] modified tests to accommodate changes to ComponentFallbackQuery --- .../fixture/component_geocoding/fallback.json | 193 +++++++++++++++++- test/unit/query/component_geocoding.js | 9 - 2 files changed, 192 insertions(+), 10 deletions(-) diff --git a/test/unit/fixture/component_geocoding/fallback.json b/test/unit/fixture/component_geocoding/fallback.json index 11891881..965b188f 100644 --- a/test/unit/fixture/component_geocoding/fallback.json +++ b/test/unit/fixture/component_geocoding/fallback.json @@ -6,6 +6,197 @@ "query": { "bool": { "should": [ + { + "bool": { + "_name": "fallback.address", + "must": [ + { + "match_phrase": { + "address_parts.number": "number value" + } + }, + { + "match_phrase": { + "address_parts.street": "street value" + } + }, + { + "multi_match": { + "query": "neighbourhood value", + "type": "phrase", + "fields": [ + "parent.neighbourhood", + "parent.neighbourhood_a" + ] + } + }, + { + "multi_match": { + "query": "borough value", + "type": "phrase", + "fields": [ + "parent.borough", + "parent.borough_a" + ] + } + }, + { + "multi_match": { + "query": "city value", + "type": "phrase", + "fields": [ + "parent.locality", + "parent.locality_a", + "parent.localadmin", + "parent.localadmin_a" + ] + } + }, + { + "multi_match": { + "query": "county value", + "type": "phrase", + "fields": [ + "parent.county", + "parent.county_a", + "parent.macrocounty", + "parent.macrocounty_a" + ] + } + }, + { + "multi_match": { + "query": "state value", + "type": "phrase", + "fields": [ + "parent.region", + "parent.region_a", + "parent.macroregion", + "parent.macroregion_a" + ] + } + }, + { + "multi_match": { + "query": "country value", + "type": "phrase", + "fields": [ + "parent.country", + "parent.country_a", + "parent.dependency", + "parent.dependency_a" + ] + } + } + ], + "should": [ + { + "match_phrase": { + "address_parts.zip": "postalcode value" + } + } + ], + "filter": { + "term": { + "layer": "address" + } + }, + "boost": 50 + } + }, + { + "bool": { + "_name": "fallback.street", + "must": [ + { + "match_phrase": { + "address_parts.street": "street value" + } + }, + { + "multi_match": { + "query": "neighbourhood value", + "type": "phrase", + "fields": [ + "parent.neighbourhood", + "parent.neighbourhood_a" + ] + } + }, + { + "multi_match": { + "query": "borough value", + "type": "phrase", + "fields": [ + "parent.borough", + "parent.borough_a" + ] + } + }, + { + "multi_match": { + "query": "city value", + "type": "phrase", + "fields": [ + "parent.locality", + "parent.locality_a", + "parent.localadmin", + "parent.localadmin_a" + ] + } + }, + { + "multi_match": { + "query": "county value", + "type": "phrase", + "fields": [ + "parent.county", + "parent.county_a", + "parent.macrocounty", + "parent.macrocounty_a" + ] + } + }, + { + "multi_match": { + "query": "state value", + "type": "phrase", + "fields": [ + "parent.region", + "parent.region_a", + "parent.macroregion", + "parent.macroregion_a" + ] + } + }, + { + "multi_match": { + "query": "country value", + "type": "phrase", + "fields": [ + "parent.country", + "parent.country_a", + "parent.dependency", + "parent.dependency_a" + ] + } + } + ], + "should": [ + { + "match_phrase": { + "address_parts.zip": "postalcode value" + } + } + ], + "filter": { + "term": { + "layer": "street" + } + }, + "boost": 100 + } + }, { "bool": { "_name": "fallback.neighbourhood", @@ -514,4 +705,4 @@ ], "size": 20, "track_scores": true -} \ No newline at end of file +} diff --git a/test/unit/query/component_geocoding.js b/test/unit/query/component_geocoding.js index 68bedf6a..42e4736d 100644 --- a/test/unit/query/component_geocoding.js +++ b/test/unit/query/component_geocoding.js @@ -14,7 +14,6 @@ module.exports.tests.query = function(test, common) { test('valid search + focus + bbox', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, @@ -39,7 +38,6 @@ module.exports.tests.query = function(test, common) { test('valid search + bbox', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, @@ -63,7 +61,6 @@ module.exports.tests.query = function(test, common) { test('valid lingustic-only search', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, layers: ['test'] @@ -82,7 +79,6 @@ module.exports.tests.query = function(test, common) { test('search search + focus', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, 'focus.point.lat': 29.49136, 'focus.point.lon': -82.50622, @@ -102,7 +98,6 @@ module.exports.tests.query = function(test, common) { test('search search + viewport', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, 'focus.viewport.min_lat': 28.49136, @@ -127,7 +122,6 @@ module.exports.tests.query = function(test, common) { test('search with viewport diagonal < 1km should set scale to 1km', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, 'focus.viewport.min_lat': 28.49135, @@ -150,7 +144,6 @@ module.exports.tests.query = function(test, common) { test('search search + focus on null island', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, 'focus.point.lat': 0, 'focus.point.lon': 0, @@ -198,7 +191,6 @@ module.exports.tests.query = function(test, common) { test('valid boundary.country search', function(t) { var clean = { parsed_text: { - street: 'street value' }, text: 'test', querySize: 10, layers: ['test'], @@ -218,7 +210,6 @@ module.exports.tests.query = function(test, common) { test('valid sources filter', function(t) { var clean = { parsed_text: { - street: 'street value' }, 'text': 'test', 'sources': ['test_source'] From 75366f98c420c8b4fd3079bf35ff3aa24837e139 Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Wed, 23 Nov 2016 00:14:25 -0500 Subject: [PATCH 4/6] added text-analyzer call to parse supplied `address` field --- sanitizer/_synthesize_analysis.js | 28 +++++ test/unit/sanitizer/_synthesize_analysis.js | 124 +++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/sanitizer/_synthesize_analysis.js b/sanitizer/_synthesize_analysis.js index 04ffeb34..79c53187 100644 --- a/sanitizer/_synthesize_analysis.js +++ b/sanitizer/_synthesize_analysis.js @@ -1,4 +1,5 @@ const _ = require('lodash'); +const text_analyzer = require('pelias-text-analyzer'); const fields = { 'address': 'address', @@ -20,6 +21,15 @@ function isPostalCodeOnly(parsed_text) { parsed_text.hasOwnProperty('postalcode'); } +function getHouseNumberField(analyzed_address) { + for (var field of ['number', 'postalcode']) { + if (analyzed_address.hasOwnProperty(field)) { + return field; + } + } + +} + function sanitize( raw, clean ){ // error & warning messages @@ -43,6 +53,24 @@ function sanitize( raw, clean ){ `at least one of the following fields is required: ${Object.keys(fields).join(', ')}`); } + if (clean.parsed_text.hasOwnProperty('address')) { + var analyzed_address = text_analyzer.parse(clean.parsed_text.address); + + const house_number_field = getHouseNumberField(analyzed_address); + + if (house_number_field) { + clean.parsed_text.number = analyzed_address[house_number_field]; + + clean.parsed_text.street = _.trim(_.replace(clean.parsed_text.address, clean.parsed_text.number, '')); + delete clean.parsed_text.address; + + } else { + clean.parsed_text.street = clean.parsed_text.address; + delete clean.parsed_text.address; + } + + } + return messages; } diff --git a/test/unit/sanitizer/_synthesize_analysis.js b/test/unit/sanitizer/_synthesize_analysis.js index 20f05fc0..dabacef8 100644 --- a/test/unit/sanitizer/_synthesize_analysis.js +++ b/test/unit/sanitizer/_synthesize_analysis.js @@ -1,13 +1,18 @@ -const sanitizer = require('../../../sanitizer/_synthesize_analysis'); const _ = require('lodash'); +const proxyquire = require('proxyquire').noCallThru(); module.exports.tests = {}; module.exports.tests.text_parser = function(test, common) { test('all variables should be parsed', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.fail('parse should not have been called'); + } + }}); + const raw = { query: ' \t query \t value \t ', - address: ' \t address \t value \t ', neighbourhood: ' \t neighbourhood \t value \t ', borough: ' \t borough \t value \t ', locality: ' \t locality \t value \t ', @@ -21,7 +26,6 @@ module.exports.tests.text_parser = function(test, common) { const expected_clean = { parsed_text: { - address: 'address value', neighbourhood: 'neighbourhood value', borough: 'borough value', city: 'locality value', @@ -42,6 +46,12 @@ module.exports.tests.text_parser = function(test, common) { }); test('non-string and blank string values should be treated as not supplied', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.fail('parse should not have been called'); + } + }}); + // helper to return a random value that's considered invalid function getInvalidValue() { return _.sample([{}, [], false, '', ' \t ', 17, undefined]); @@ -75,6 +85,12 @@ module.exports.tests.text_parser = function(test, common) { }); test('no supplied fields should return error', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.fail('parse should not have been called'); + } + }}); + const raw = {}; const clean = {}; @@ -92,6 +108,12 @@ module.exports.tests.text_parser = function(test, common) { }); test('postalcode-only parsed_text should return error', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.fail('parse should not have been called'); + } + }}); + const raw = { postalcode: 'postalcode value' }; @@ -113,6 +135,102 @@ module.exports.tests.text_parser = function(test, common) { }); + test('text_analyzer identifying house number should extract it and street', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.equals(query, 'Number Value Street Value Number Value'); + + return { + number: 'Number Value' + }; + } + }}); + + const raw = { + address: 'Number Value Street Value Number Value' + }; + + const clean = {}; + + const expected_clean = { + parsed_text: { + number: 'Number Value', + street: 'Street Value Number Value' + } + }; + + const messages = sanitizer(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + + }); + + test('text_analyzer identifying postalcode but not house number should assign to number and remove from address', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.equals(query, 'Number Value Street Value Number Value'); + + return { + postalcode: 'Number Value' + }; + } + }}); + + const raw = { + address: 'Number Value Street Value Number Value' + }; + + const clean = {}; + + const expected_clean = { + parsed_text: { + number: 'Number Value', + street: 'Street Value Number Value' + } + }; + + const messages = sanitizer(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + + }); + + test('text_analyzer not revealing possible number should move address to street', function(t) { + var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', { + 'pelias-text-analyzer': { parse: function(query) { + t.equals(query, 'Street Value'); + + return {}; + } + }}); + + const raw = { + address: 'Street Value' + }; + + const clean = {}; + + const expected_clean = { + parsed_text: { + street: 'Street Value' + } + }; + + const messages = sanitizer(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + + }); + }; module.exports.all = function (tape, common) { From 26b305e0d90bcad7d3f260a8e49509636e3504eb Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Tue, 29 Nov 2016 10:54:59 -0500 Subject: [PATCH 5/6] bumped query version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 80fd13e3..0e18e8c6 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-labels": "1.5.1", "pelias-logger": "0.1.0", "pelias-model": "4.3.0", - "pelias-query": "pelias/query#5338af9", + "pelias-query": "8.10.0", "pelias-text-analyzer": "1.6.0", "stats-lite": "2.0.3", "through2": "2.0.1" From dd448355dc0808673fa012127f8157c759f50cd2 Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Tue, 29 Nov 2016 11:47:16 -0500 Subject: [PATCH 6/6] added comments, made things `const` --- sanitizer/_synthesize_analysis.js | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sanitizer/_synthesize_analysis.js b/sanitizer/_synthesize_analysis.js index 79c53187..ab18b581 100644 --- a/sanitizer/_synthesize_analysis.js +++ b/sanitizer/_synthesize_analysis.js @@ -21,6 +21,10 @@ function isPostalCodeOnly(parsed_text) { parsed_text.hasOwnProperty('postalcode'); } +// figure out which field contains the probable house number, prefer number +// libpostal parses some inputs, like `3370 cobbe ave`, as a postcode+street +// so because we're treating the entire field as a street address, it's safe +// to assume that an identified postcode is actually a house number. function getHouseNumberField(analyzed_address) { for (var field of ['number', 'postalcode']) { if (analyzed_address.hasOwnProperty(field)) { @@ -54,21 +58,31 @@ function sanitize( raw, clean ){ } if (clean.parsed_text.hasOwnProperty('address')) { - var analyzed_address = text_analyzer.parse(clean.parsed_text.address); + const analyzed_address = text_analyzer.parse(clean.parsed_text.address); const house_number_field = getHouseNumberField(analyzed_address); + // if we're fairly certain that libpostal identified a house number + // (from either the house_number or postcode field), place it into the + // number field and remove the first instance of that value from address + // and assign to street + // eg - '1090 N Charlotte St' becomes number=1090 and street=N Charlotte St if (house_number_field) { clean.parsed_text.number = analyzed_address[house_number_field]; + // remove the first instance of the number and trim whitespace clean.parsed_text.street = _.trim(_.replace(clean.parsed_text.address, clean.parsed_text.number, '')); - delete clean.parsed_text.address; } else { + // otherwise no house number was identifiable, so treat the entire input + // as a street clean.parsed_text.street = clean.parsed_text.address; - delete clean.parsed_text.address; + } + // the address field no longer means anything since it's been parsed, so remove it + delete clean.parsed_text.address; + } return messages;