From aa3e764e49a76cb6d8f9fb2466aee42bf7e98b8f Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 29 Mar 2016 14:41:44 +0200 Subject: [PATCH 01/24] update analyzers to work with https://github.com/pelias/schema/pull/109 --- query/autocomplete_defaults.js | 6 +++--- query/reverse_defaults.js | 2 +- query/search_defaults.js | 2 +- .../autocomplete_linguistic_final_token.js | 6 +++--- .../fixture/autocomplete_linguistic_focus.js | 8 ++++---- ...utocomplete_linguistic_focus_null_island.js | 8 ++++---- .../autocomplete_linguistic_multiple_tokens.js | 10 +++++----- .../fixture/autocomplete_linguistic_only.js | 6 +++--- .../autocomplete_linguistic_with_admin.js | 8 ++++---- test/unit/fixture/search_boundary_country.js | 2 +- test/unit/fixture/search_full_address.js | 2 +- test/unit/fixture/search_linguistic_bbox.js | 2 +- test/unit/fixture/search_linguistic_focus.js | 2 +- .../fixture/search_linguistic_focus_bbox.js | 2 +- .../search_linguistic_focus_null_island.js | 2 +- test/unit/fixture/search_linguistic_only.js | 2 +- .../unit/fixture/search_linguistic_viewport.js | 2 +- .../search_linguistic_viewport_min_diagonal.js | 2 +- test/unit/fixture/search_partial_address.js | 2 +- test/unit/fixture/search_regions_address.js | 2 +- test/unit/query/autocomplete.js | 18 +++++++++--------- 21 files changed, 48 insertions(+), 48 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049..da0791ef 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -20,12 +20,12 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasPhrase', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 100, - 'phrase:analyzer': 'peliasPhrase', - 'phrase:field': 'phrase.default', + 'phrase:analyzer': 'peliasQueryFullToken', + 'phrase:field': 'name.default', 'phrase:boost': 1, 'phrase:slop': 2, diff 
--git a/query/reverse_defaults.js b/query/reverse_defaults.js index 306efaac..06ad6400 100644 --- a/query/reverse_defaults.js +++ b/query/reverse_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/query/search_defaults.js b/query/search_defaults.js index ea0dc87f..3c26f4dc 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fbe80052..e100206b 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', @@ -45,7 +45,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index 4f722b84..bcb18d0d 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ 
-7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -64,7 +64,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -89,7 +89,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index d9c04fd1..65a3146d 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -64,7 +64,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -89,7 +89,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js 
b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab..db6c4fc4 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -6,8 +6,8 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, 'slop': 2, @@ -18,7 +18,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'three', 'type': 'phrase', @@ -31,7 +31,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two three', 'type': 'phrase', @@ -56,7 +56,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two three', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index 24b89ad9..e4fe20ee 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -45,7 +45,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js 
b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f6451..59e77c0c 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -7,8 +7,8 @@ module.exports = { 'must': [ { 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, 'slop': 2, @@ -86,7 +86,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two', 'type': 'phrase', @@ -114,7 +114,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two', 'type': 'phrase', diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index 4bf45315..b84dd0e9 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 8a8290ab..570e5eca 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index 5bb5907c..e9368bd5 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 
'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index 5d03d66d..c495243a 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 96fe92f6..365b37d8 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index 0924475d..a9e49a06 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index 58c05826..bf1056f9 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js b/test/unit/fixture/search_linguistic_viewport.js index be76ab05..bcd39af2 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 
'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index cf44d0d8..2d1d3e2d 100644 --- a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6c4174b6..aa45ca68 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index e0c05f3c..82127051 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index dc973ddc..5ea2182d 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -20,7 +20,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_only'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_only'); t.end(); }); @@ -30,9 +30,9 @@ 
module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_multiple_tokens.js'); + var expected = require('../fixture/autocomplete_linguistic_multiple_tokens'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_multiple_tokens'); t.end(); }); @@ -47,9 +47,9 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_with_admin.js'); + var expected = require('../fixture/autocomplete_linguistic_with_admin'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_with_admin'); t.end(); }); @@ -62,9 +62,9 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_final_token.js'); + var expected = require('../fixture/autocomplete_linguistic_final_token'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_final_token'); t.end(); }); @@ -78,7 +78,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus'); t.end(); }); @@ -92,7 +92,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus_null_island'); t.end(); }); }; 
From 3a789b4a933e4636bd47206ca9c96412d37104b4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 7 Apr 2016 12:39:18 +0200 Subject: [PATCH 02/24] increase autocomplete 'phrase:slop' setting from 2->3 --- query/autocomplete_defaults.js | 2 +- test/unit/fixture/autocomplete_linguistic_multiple_tokens.js | 2 +- test/unit/fixture/autocomplete_linguistic_with_admin.js | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049..cd45b62d 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -27,7 +27,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'phrase:analyzer': 'peliasPhrase', 'phrase:field': 'phrase.default', 'phrase:boost': 1, - 'phrase:slop': 2, + 'phrase:slop': 3, 'focus:function': 'linear', 'focus:offset': '0km', diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab..eaf01ee6 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -10,7 +10,7 @@ module.exports = { 'analyzer': 'peliasPhrase', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f6451..8f2edc44 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -11,7 +11,7 @@ module.exports = { 'analyzer': 'peliasPhrase', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } From fd529ccee9ae95073df92d3850a289b55169252a Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Thu, 14 Apr 2016 15:50:48 -0400 Subject: [PATCH 03/24] Use new express sendStatus method According to a message on the console: ``` express deprecated res.send(status): 
Use res.sendStatus(status) instead middleware/options.js:12:9 ``` --- middleware/options.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/middleware/options.js b/middleware/options.js index 57c0e1eb..f6ca8767 100644 --- a/middleware/options.js +++ b/middleware/options.js @@ -9,10 +9,10 @@ function middleware(req, res, next){ if( req.method === 'OPTIONS' ){ - res.send(200); + res.sendStatus(200); } else { next(); } } -module.exports = middleware; \ No newline at end of file +module.exports = middleware; From e40c9ef32623f78e8a3405779eba1e84287d8091 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 15 Apr 2016 15:21:53 +0200 Subject: [PATCH 04/24] increase focus weight from 10->40 and simplify population/popularity subview --- query/autocomplete.js | 7 ++++--- query/autocomplete_defaults.js | 2 +- query/view/pop_subquery.js | 16 ++++++++++++++++ .../autocomplete_linguistic_final_token.js | 10 ++-------- .../fixture/autocomplete_linguistic_focus.js | 12 +++--------- .../autocomplete_linguistic_focus_null_island.js | 12 +++--------- .../autocomplete_linguistic_multiple_tokens.js | 10 ++-------- .../unit/fixture/autocomplete_linguistic_only.js | 10 ++-------- .../autocomplete_linguistic_with_admin.js | 10 ++-------- .../autocomplete_with_source_filtering.js | 10 ++-------- 10 files changed, 37 insertions(+), 62 deletions(-) create mode 100644 query/view/pop_subquery.js diff --git a/query/autocomplete.js b/query/autocomplete.js index ffc57396..0416163d 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -9,7 +9,8 @@ var views = { ngrams_strict: require('./view/ngrams_strict'), focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), - phrase_first_tokens_only: require('./view/phrase_first_tokens_only') + phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), + pop_subquery: require('./view/pop_subquery') }; //------------------------------ @@ 
-38,8 +39,8 @@ query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost query.score( views.focus_selected_layers( views.ngrams_strict ) ); -query.score( peliasQuery.view.popularity( views.ngrams_strict ) ); -query.score( peliasQuery.view.population( views.ngrams_strict ) ); +query.score( peliasQuery.view.popularity( views.pop_subquery ) ); +query.score( peliasQuery.view.population( views.pop_subquery ) ); // non-scoring hard filters query.filter( peliasQuery.view.sources ); diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index da0791ef..cacc8297 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -33,7 +33,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'focus:offset': '0km', 'focus:scale': '250km', 'focus:decay': 0.5, - 'focus:weight': 10, + 'focus:weight': 40, 'function_score:score_mode': 'avg', 'function_score:boost_mode': 'multiply', diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js new file mode 100644 index 00000000..bde1492b --- /dev/null +++ b/query/view/pop_subquery.js @@ -0,0 +1,16 @@ + +var peliasQuery = require('pelias-query'); + +/** + Population / Popularity subquery +**/ + +module.exports = function( vs ){ + + var view = peliasQuery.view.ngrams( vs ); + + view.match['name.default'].analyzer = 'peliasQueryFullToken'; + delete view.match['name.default'].boost; + + return view; +}; diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index e100206b..fc431c77 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 
'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index bcb18d0d..430d43c9 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index 65a3146d..9a4afc05 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git 
a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index db6c4fc4..26e5fa86 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -31,11 +31,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -56,11 +53,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index e4fe20ee..4360f4d7 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 59e77c0c..b183bf77 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -86,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 
'type': 'phrase', - 'operator': 'and' } } }, @@ -114,11 +111,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_with_source_filtering.js b/test/unit/fixture/autocomplete_with_source_filtering.js index 5f3bcf07..075eb6d4 100644 --- a/test/unit/fixture/autocomplete_with_source_filtering.js +++ b/test/unit/fixture/autocomplete_with_source_filtering.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, From 7876650581922c3c6b24794b2c940afcc5cd9619 Mon Sep 17 00:00:00 2001 From: greenkeeperio-bot Date: Fri, 15 Apr 2016 14:28:04 -0700 Subject: [PATCH 05/24] chore(package): update pelias-model to version 4.0.0 http://greenkeeper.io/ --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 0c0cc1f7..ed536057 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "morgan": "1.7.0", "pelias-config": "^1.0.1", "pelias-logger": "^0.0.8", - "pelias-model": "^3.1.0", + "pelias-model": "^4.0.0", "pelias-query": "6.2.0", "pelias-suggester-pipeline": "2.0.4", "stats-lite": "1.0.3", From 2c4082770153c442585272a63b25ba547945c3fd Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Fri, 15 Apr 2016 20:42:38 -0400 Subject: [PATCH 06/24] added `layer=borough` support for *all 5 boroughs* in data --- helper/type_mapping.js | 3 ++- test/ciao/autocomplete/layers_alias_coarse.coffee | 1 + 
test/ciao/autocomplete/layers_invalid.coffee | 2 +- test/ciao/autocomplete/layers_mix_invalid_valid.coffee | 2 +- test/ciao/reverse/layers_alias_coarse.coffee | 1 + test/ciao/reverse/layers_invalid.coffee | 2 +- test/ciao/reverse/layers_mix_invalid_valid.coffee | 2 +- test/ciao/search/layers_alias_coarse.coffee | 1 + test/ciao/search/layers_invalid.coffee | 2 +- test/ciao/search/layers_mix_invalid_valid.coffee | 2 +- test/unit/helper/type_mapping.js | 2 +- test/unit/sanitiser/_layers.js | 10 +++++----- 12 files changed, 17 insertions(+), 13 deletions(-) diff --git a/helper/type_mapping.js b/helper/type_mapping.js index 0b20c111..ed20c0d8 100644 --- a/helper/type_mapping.js +++ b/helper/type_mapping.js @@ -49,7 +49,8 @@ var LAYERS_BY_SOURCE = { openaddresses: [ 'address' ], geonames: [ 'country', 'region', 'county', 'locality', 'venue' ], whosonfirst: [ 'continent', 'country', 'dependency', 'macroregion', 'region', - 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood', 'disputed'] + 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'borough', + 'neighbourhood', 'microhood', 'disputed'] }; /* diff --git a/test/ciao/autocomplete/layers_alias_coarse.coffee b/test/ciao/autocomplete/layers_alias_coarse.coffee index 2fa2265c..3db308be 100644 --- a/test/ciao/autocomplete/layers_alias_coarse.coffee +++ b/test/ciao/autocomplete/layers_alias_coarse.coffee @@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/autocomplete/layers_invalid.coffee b/test/ciao/autocomplete/layers_invalid.coffee index 620b5586..6f3cebe0 100644 --- a/test/ciao/autocomplete/layers_invalid.coffee +++ b/test/ciao/autocomplete/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? 
expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/autocomplete/layers_mix_invalid_valid.coffee b/test/ciao/autocomplete/layers_mix_invalid_valid.coffee index 963b79ab..a819dd44 100644 --- a/test/ciao/autocomplete/layers_mix_invalid_valid.coffee +++ b/test/ciao/autocomplete/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? 
expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/reverse/layers_alias_coarse.coffee b/test/ciao/reverse/layers_alias_coarse.coffee index 09c91483..40ce2e37 100644 --- a/test/ciao/reverse/layers_alias_coarse.coffee +++ b/test/ciao/reverse/layers_alias_coarse.coffee @@ -40,6 +40,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/reverse/layers_invalid.coffee b/test/ciao/reverse/layers_invalid.coffee index aaec4864..bc57a3b3 100644 --- a/test/ciao/reverse/layers_invalid.coffee +++ b/test/ciao/reverse/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/reverse/layers_mix_invalid_valid.coffee b/test/ciao/reverse/layers_mix_invalid_valid.coffee index 307b225d..16f40b9e 100644 --- a/test/ciao/reverse/layers_mix_invalid_valid.coffee +++ b/test/ciao/reverse/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. 
Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/search/layers_alias_coarse.coffee b/test/ciao/search/layers_alias_coarse.coffee index bf7cdb52..48723853 100644 --- a/test/ciao/search/layers_alias_coarse.coffee +++ b/test/ciao/search/layers_alias_coarse.coffee @@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/search/layers_invalid.coffee b/test/ciao/search/layers_invalid.coffee index 4f2da456..cc6feab8 100644 --- a/test/ciao/search/layers_invalid.coffee +++ b/test/ciao/search/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? 
expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/search/layers_mix_invalid_valid.coffee b/test/ciao/search/layers_mix_invalid_valid.coffee index f004c69e..f332e051 100644 --- a/test/ciao/search/layers_mix_invalid_valid.coffee +++ b/test/ciao/search/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/unit/helper/type_mapping.js b/test/unit/helper/type_mapping.js index 355fd4e6..a9ec4721 100644 --- a/test/unit/helper/type_mapping.js +++ b/test/unit/helper/type_mapping.js @@ -14,7 +14,7 @@ module.exports.tests.interfaces = function(test, common) { t.deepEquals(type_mapping.layer_mapping.coarse, [ 'continent', 'country', 'dependency', 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', - 'neighbourhood', 'microhood', 'disputed' ]); + 'borough', 'neighbourhood', 'microhood', 'disputed' ]); t.end(); }); diff --git a/test/unit/sanitiser/_layers.js b/test/unit/sanitiser/_layers.js index b9dcbd0f..a1fde0f4 100644 --- a/test/unit/sanitiser/_layers.js +++ b/test/unit/sanitiser/_layers.js @@ -42,8 +42,8 @@ module.exports.tests.sanitize_layers = function(test, common) { sanitize(raw, clean); var admin_layers = [ 'continent', 'country', 'dependency', - 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 
'neighbourhood', - 'microhood', 'disputed' ]; + 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', + 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; t.deepEqual(clean.layers, admin_layers, 'coarse layers set'); t.end(); @@ -77,8 +77,8 @@ module.exports.tests.sanitize_layers = function(test, common) { sanitize(raw, clean); var expected_layers = [ 'continent', 'country', 'dependency', - 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', - 'microhood', 'disputed' ]; + 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', + 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; t.deepEqual(clean.layers, expected_layers, 'coarse + regular layers set'); t.end(); @@ -114,7 +114,7 @@ module.exports.tests.sanitize_layers = function(test, common) { var coarse_layers = [ 'continent', 'country', 'dependency', 'macroregion', 'region', 'locality', 'localadmin', - 'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood', + 'macrocounty', 'county', 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; var venue_layers = [ 'venue' ]; From 9cdfce13c18955aab771840fee524d3e2ff64c2a Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Mon, 18 Apr 2016 15:17:08 -0400 Subject: [PATCH 07/24] fixed extraneous comma labels for results with only a `name` --- helper/labelGenerator.js | 27 ++++++++++++++------- test/unit/helper/labelGenerator_examples.js | 11 +++++++++ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/helper/labelGenerator.js b/helper/labelGenerator.js index 20c73a98..97993e03 100644 --- a/helper/labelGenerator.js +++ b/helper/labelGenerator.js @@ -16,18 +16,27 @@ module.exports = function( record ){ // retain only things that are truthy labelParts = _.compact(labelParts); - // first, dedupe the name and 1st label array elements - // this is used to ensure that the `name` and first admin hierarchy elements 
aren't repeated - // eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]` - var dedupedNameAndFirstLabelElement = _.uniq([labelParts.shift(), labelParts.shift()]); + // third, dedupe and join with a comma and return + return dedupeNameAndFirstLabelElement(labelParts).join(', '); - // second, unshift the deduped parts back onto the labelParts - labelParts.unshift.apply(labelParts, dedupedNameAndFirstLabelElement); +}; - // third, join with a comma and return - return labelParts.join(', '); +function dedupeNameAndFirstLabelElement(labelParts) { + // only dedupe if a result has more than a name (the first label part) + if (labelParts.length > 1) { + // first, dedupe the name and 1st label array elements + // this is used to ensure that the `name` and first admin hierarchy elements aren't repeated + // eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]` + var deduped = _.uniq([labelParts.shift(), labelParts.shift()]); -}; + // second, unshift the deduped parts back onto the labelParts + labelParts.unshift.apply(labelParts, deduped); + + } + + return labelParts; + +} function getSchema(country_a) { if (country_a && country_a.length && schemas[country_a]) { diff --git a/test/unit/helper/labelGenerator_examples.js b/test/unit/helper/labelGenerator_examples.js index 416b7598..3207df86 100644 --- a/test/unit/helper/labelGenerator_examples.js +++ b/test/unit/helper/labelGenerator_examples.js @@ -104,6 +104,17 @@ module.exports.tests.france = function(test, common) { }; +module.exports.tests.name_only = function(test, common) { + test('name-only results (no admin fields) should not include extraneous comma', function(t) { + var doc = { + 'name': 'Result name', + }; + t.equal(generator(doc),'Result name'); + t.end(); + }); + +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { From bd3b8e7bbed7acdf1b4266cc122c66390e818b0b Mon Sep 17 00:00:00 2001 
From: Stephen Hess Date: Tue, 19 Apr 2016 21:38:56 -0400 Subject: [PATCH 08/24] switched to pelias-text-analyzer package since that responsibility has been moved to there --- helper/text_parser.js | 86 ------------------ package.json | 2 +- sanitiser/_text.js | 4 +- test/unit/helper/text_parser.js | 150 -------------------------------- test/unit/query/autocomplete.js | 1 - test/unit/query/search.js | 8 +- test/unit/run.js | 1 - test/unit/sanitiser/search.js | 4 +- 8 files changed, 9 insertions(+), 247 deletions(-) delete mode 100644 helper/text_parser.js delete mode 100644 test/unit/helper/text_parser.js diff --git a/helper/text_parser.js b/helper/text_parser.js deleted file mode 100644 index 0db8bede..00000000 --- a/helper/text_parser.js +++ /dev/null @@ -1,86 +0,0 @@ - -var parser = require('addressit'); -var extend = require('extend'); -var type_mapping = require('../helper/type_mapping'); -var check = require('check-types'); -var logger = require('pelias-logger').get('api'); - -var DELIM = ','; - -/* - * For performance, and to prefer POI and admin records, express a preference - * to only search coarse layers on very short text inputs. - */ -module.exports.get_layers = function get_layers(query) { - if (query.length <= 3 ) { - // no address parsing required - return type_mapping.layer_mapping.coarse; - } -}; - -module.exports.get_parsed_address = function get_parsed_address(query) { - - var getAdminPartsBySplittingOnDelim = function(queryParts) { - // naive approach - for admin matching during query time - // split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny' - - var address = {}; - - if (queryParts.length > 1) { - address.name = queryParts[0].trim(); - - // 1. slice away all parts after the first one - // 2. trim spaces from each part just in case - // 3. 
join the parts back together with appropriate delimiter and spacing - address.admin_parts = queryParts.slice(1) - .map(function (part) { return part.trim(); }) - .join(DELIM + ' '); - } - - return address; - }; - - var getAddressParts = function(query) { - // perform full address parsing - // except on queries so short they obviously can't contain an address - if (query.length > 3) { - return parser( query ); - } - }; - - var queryParts = query.split(DELIM); - - var addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts); - var addressWithAddressParts= getAddressParts(queryParts.join(DELIM + ' ')); - - var parsedAddress = extend(addressWithAdminParts, - addressWithAddressParts); - - var address_parts = [ 'name', - 'number', - 'street', - 'city', - 'state', - 'country', - 'postalcode', - 'regions', - 'admin_parts' - ]; - - var parsed_text = {}; - - address_parts.forEach(function(part){ - if (parsedAddress[part]) { - parsed_text[part] = parsedAddress[part]; - } - }); - - // if all we found was regions, ignore it as it is not enough information to make smarter decisions - if (Object.keys(parsed_text).length === 1 && !check.undefined(parsed_text.regions)) - { - logger.info('Ignoring address parser output, regions only'); - return null; - } - - return parsed_text; -}; diff --git a/package.json b/package.json index 0c0cc1f7..fc511657 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,6 @@ "elasticsearch": ">=1.2.1" }, "dependencies": { - "addressit": "git://github.com/dianashk/addressit.git#temp", "async": "^1.5.2", "check-types": "^6.0.0", "cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve", @@ -56,6 +55,7 @@ "pelias-model": "^3.1.0", "pelias-query": "6.2.0", "pelias-suggester-pipeline": "2.0.4", + "pelias-text-analyzer": "^1.0.1", "stats-lite": "1.0.3", "through2": "2.0.1" }, diff --git a/sanitiser/_text.js b/sanitiser/_text.js index e6897a5e..4709eeee 100644 --- a/sanitiser/_text.js +++ b/sanitiser/_text.js @@ -1,5 +1,5 @@ var 
check = require('check-types'), - text_parser = require('../helper/text_parser'); + text_analyzer = require('pelias-text-analyzer'); // validate texts, convert types and apply defaults function sanitize( raw, clean ){ @@ -19,7 +19,7 @@ function sanitize( raw, clean ){ clean.text = raw.text; // parse text with query parser - var parsed_text = text_parser.get_parsed_address(clean.text); + var parsed_text = text_analyzer.parse(clean.text); if (check.assigned(parsed_text)) { clean.parsed_text = parsed_text; } diff --git a/test/unit/helper/text_parser.js b/test/unit/helper/text_parser.js deleted file mode 100644 index ca5b05f0..00000000 --- a/test/unit/helper/text_parser.js +++ /dev/null @@ -1,150 +0,0 @@ -var parser = require('../../../helper/text_parser'); - -var type_mapping = require('../../../helper/type_mapping'); -var layers_map = type_mapping.layer_mapping; - -module.exports.tests = {}; - -module.exports.tests.interface = function(test, common) { - test('interface', function(t) { - t.equal(typeof parser.get_parsed_address, 'function', 'valid function'); - t.equal(typeof parser.get_layers, 'function', 'valid function'); - t.end(); - }); -}; - -module.exports.tests.split_on_comma = function(test, common) { - var queries = [ - { name: 'soho', admin_parts: 'new york' }, - { name: 'chelsea', admin_parts: 'london' }, - { name: '123 main', admin_parts: 'new york' } - ]; - - queries.forEach(function (query) { - test('naive parsing ' + query, function(t) { - var address = parser.get_parsed_address(query.name + ', ' + query.admin_parts); - - t.equal(typeof address, 'object', 'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - - test('naive parsing ' + query + 'without spaces', function(t) { - var address = parser.get_parsed_address(query.name + ',' + query.admin_parts); - - t.equal(typeof address, 'object', 
'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - }); -}; - -module.exports.tests.parse_three_chars_or_less = function(test, common) { - var chars_queries = ['a', 'bb', 'ccc']; - var num_queries = ['1', '12', '123']; - var alphanum_q = ['a1', '1a2', '12c']; - - var queries = chars_queries.concat(num_queries).concat(alphanum_q); - queries.forEach(function(query) { - test('query length < 3 (' + query + ')', function(t) { - var address = parser.get_parsed_address(query); - var target_layer = layers_map.coarse; - var layers = parser.get_layers(query); - - t.equal(typeof address, 'object', 'valid object'); - t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', ')); - t.end(); - }); - }); -}; - -module.exports.tests.parse_one_token = function(test, common) { - test('query with one token', function (t) { - var address = parser.get_parsed_address('yugolsavia'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, no numbers', function (t) { - var address = parser.get_parsed_address('small town'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number first', function (t) { - var address = parser.get_parsed_address('123 main'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number second', function (t) { - var address = parser.get_parsed_address('main 123'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with many tokens', function(t) { - var address = parser.get_parsed_address('main particle new york'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); -}; - -module.exports.tests.parse_address = function(test, 
common) { - test('valid address, house number', function(t) { - var query_string = '123 main st new york ny'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.end(); - }); - test('valid address, zipcode', function(t) { - var query_string = '123 main st new york ny 10010'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.equal(address.postalcode, '10010', 'parsed zip is a string'); - t.end(); - }); - test('valid address with leading 0s in zipcode', function(t) { - var query_string = '339 W Main St, Cheshire, 06410'; - var address = parser.get_parsed_address(query_string); - - console.log(address); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Cheshire'], 'parsed city'); - t.equal(address.postalcode, '06410', 'parsed zip'); - t.end(); - }); - test('valid address without spaces after commas', function(t) { - var query_string = '339 W Main St,Lancaster,PA'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '339', 'parsed house number'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Lancaster'], 'parsed city'); - t.deepEqual(address.state, 'PA', 'parsed state'); - t.end(); - }); -}; - - -module.exports.all = 
function (tape, common) { - - function test(name, testFunction) { - return tape('QUERY PARSING: ' + name, testFunction); - } - - for( var testCase in module.exports.tests ){ - module.exports.tests[testCase](test, common); - } -}; diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 0e09457b..f4b44ffa 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -1,6 +1,5 @@ var generate = require('../../../query/autocomplete'); -var parser = require('../../../helper/text_parser'); module.exports.tests = {}; diff --git a/test/unit/query/search.js b/test/unit/query/search.js index e503311b..426eb2bc 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -1,5 +1,5 @@ var generate = require('../../../query/search'); -var parser = require('../../../helper/text_parser'); +var text_analyzer = require('pelias-text-analyzer'); module.exports.tests = {}; @@ -128,7 +128,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(address), + parsed_text: text_analyzer.parse(address), }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -143,7 +143,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -158,7 +158,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: 
parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); diff --git a/test/unit/run.js b/test/unit/run.js index 6f64f2e6..94d9ebb3 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -19,7 +19,6 @@ var tests = [ require('./helper/labelGenerator_GBR'), require('./helper/labelGenerator_USA'), require('./helper/labelSchema'), - require('./helper/text_parser'), require('./helper/type_mapping'), require('./helper/sizeCalculator'), require('./middleware/confidenceScore'), diff --git a/test/unit/sanitiser/search.js b/test/unit/sanitiser/search.js index e09672ce..864195a8 100644 --- a/test/unit/sanitiser/search.js +++ b/test/unit/sanitiser/search.js @@ -1,6 +1,6 @@ var extend = require('extend'), search = require('../../../sanitiser/search'), - parser = require('../../../helper/text_parser'), + text_analyzer = require('pelias-text-analyzer'), sanitize = search.sanitize, middleware = search.middleware, defaultError = 'invalid param \'text\': text length, must be >0'; @@ -80,7 +80,7 @@ module.exports.tests.sanitize_text_with_delim = function(test, common) { sanitize( req, function( ){ var expected_text = text; - var expected_parsed_text = parser.get_parsed_address(text); + var expected_parsed_text = text_analyzer.parse(text); t.equal(req.errors[0], undefined, 'no error'); t.equal(req.clean.parsed_text.name, expected_parsed_text.name, 'clean name set correctly'); t.equal(req.clean.text, expected_text, 'text should match'); From 25ab63c3b37c40c59fd309dda566562a2572da5f Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 21 Apr 2016 17:08:46 +0200 Subject: [PATCH 09/24] change search analyzer to be more similar to what we had before the autocomplete_refactor milestone --- query/search_defaults.js | 2 +- test/unit/fixture/search_boundary_country.js | 2 +- test/unit/fixture/search_full_address.js | 2 +- test/unit/fixture/search_linguistic_bbox.js | 2 +- 
test/unit/fixture/search_linguistic_focus.js | 2 +- test/unit/fixture/search_linguistic_focus_bbox.js | 2 +- test/unit/fixture/search_linguistic_focus_null_island.js | 2 +- test/unit/fixture/search_linguistic_only.js | 2 +- test/unit/fixture/search_linguistic_viewport.js | 2 +- test/unit/fixture/search_linguistic_viewport_min_diagonal.js | 2 +- test/unit/fixture/search_partial_address.js | 2 +- test/unit/fixture/search_regions_address.js | 2 +- test/unit/fixture/search_with_source_filtering.js | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/query/search_defaults.js b/query/search_defaults.js index 3c26f4dc..b0f8b119 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasQueryPartialToken', + 'ngram:analyzer': 'peliasIndexOneEdgeGram', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index b84dd0e9..71965df4 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 570e5eca..172d439f 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index e9368bd5..6afe7be6 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ 
b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index c495243a..da3e8fb3 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 365b37d8..d5042c0f 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index a9e49a06..b99febea 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index bf1056f9..a564a4c1 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js 
b/test/unit/fixture/search_linguistic_viewport.js index bcd39af2..b85d8322 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index 2d1d3e2d..e6b50ac6 100644 --- a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index aa45ca68..6810de54 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index 82127051..bf3f3dce 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_with_source_filtering.js b/test/unit/fixture/search_with_source_filtering.js index 4aedeb04..18ee13a3 100644 --- a/test/unit/fixture/search_with_source_filtering.js +++ b/test/unit/fixture/search_with_source_filtering.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 
'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], From 028ab1d8898c29e2dbfb8c642cf1bf7431d9cd10 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Thu, 21 Apr 2016 14:58:01 -0400 Subject: [PATCH 10/24] Add search config settings for borough --- query/search_defaults.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/query/search_defaults.js b/query/search_defaults.js index ea0dc87f..89aca7d6 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -78,6 +78,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:locality:field': 'parent.locality', 'admin:locality:boost': 1, + 'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 1, + 'admin:neighbourhood:analyzer': 'peliasAdmin', 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 1, From f8af354e71f06cc5deb65e5285848d60f442db58 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Thu, 21 Apr 2016 14:58:17 -0400 Subject: [PATCH 11/24] Base admin fields on placetypes --- query/text_parser.js | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/query/text_parser.js b/query/text_parser.js index 38fca48e..d19465eb 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -1,20 +1,15 @@ var logger = require('pelias-logger').get('api'); +var placeTypes = require('../helper/placeTypes'); /* This list should only contain admin fields we are comfortable matching in the case when we can't identify parts of an address. This shouldn't contain fields like country_a or postalcode because we should only try to match those when we're sure that's what they are. 
*/ -var adminFields = [ - 'country', - 'region', +var adminFields = placeTypes.concat([ 'region_a', - 'county', - 'localadmin', - 'locality', - 'neighbourhood' -]; +]); /** @todo: refactor me @@ -101,4 +96,4 @@ function addParsedVariablesToQueryVariables( parsed_text, vs ){ } } -module.exports = addParsedVariablesToQueryVariables; \ No newline at end of file +module.exports = addParsedVariablesToQueryVariables; From bf3d931640d0747090257fdbf5793456acc2b26f Mon Sep 17 00:00:00 2001 From: greenkeeperio-bot Date: Thu, 21 Apr 2016 14:46:22 -0700 Subject: [PATCH 12/24] chore(package): update pelias-query to version 6.3.0 https://greenkeeper.io/ --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a6e5c30d..8e325f5d 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "pelias-config": "^1.0.1", "pelias-logger": "^0.0.8", "pelias-model": "^4.0.0", - "pelias-query": "6.2.0", + "pelias-query": "6.3.0", "pelias-suggester-pipeline": "2.0.4", "pelias-text-analyzer": "^1.0.1", "stats-lite": "1.0.3", From 01a3233a7ba78d42bba013bce92e952144f6d34c Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 12:15:18 +0200 Subject: [PATCH 13/24] add a view to boost exact matches --- package.json | 2 +- query/autocomplete.js | 4 +- query/view/boost_exact_matches.js | 48 ++++++ ...autocomplete_linguistic_multiple_tokens.js | 14 +- .../autocomplete_linguistic_with_admin.js | 11 ++ .../autocomplete_single_character_street.js | 147 ++++++++++++++++++ test/unit/query/autocomplete.js | 17 ++ 7 files changed, 240 insertions(+), 3 deletions(-) create mode 100644 query/view/boost_exact_matches.js create mode 100644 test/unit/fixture/autocomplete_single_character_street.js diff --git a/package.json b/package.json index 8e325f5d..f70cb57b 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,7 @@ "precommit-hook": "^3.0.0", "proxyquire": "^1.4.0", "tap-dot": "1.0.5", - "tape": "^4.4.0" + "tape": "^4.5.1" }, 
"pre-commit": [ "lint", diff --git a/query/autocomplete.js b/query/autocomplete.js index 0416163d..d64151ae 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -10,7 +10,8 @@ var views = { focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), - pop_subquery: require('./view/pop_subquery') + pop_subquery: require('./view/pop_subquery'), + boost_exact_matches: require('./view/boost_exact_matches') }; //------------------------------ @@ -38,6 +39,7 @@ query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost +query.score( views.boost_exact_matches ); query.score( views.focus_selected_layers( views.ngrams_strict ) ); query.score( peliasQuery.view.popularity( views.pop_subquery ) ); query.score( peliasQuery.view.population( views.pop_subquery ) ); diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js new file mode 100644 index 00000000..8cf575f4 --- /dev/null +++ b/query/view/boost_exact_matches.js @@ -0,0 +1,48 @@ + +var peliasQuery = require('pelias-query'), + searchDefaults = require('../search_defaults'); + +/** + This view (unfortunately) requires autocomplete to use the phrase.* index. + + ideally we wouldn't need to use this, but at time of writing we are unable + to distinguish between 'complete tokens' and 'grams' in the name.* index. + + this view was introduced in order to score exact matches higher than partial + matches, without it we find results such as "Clayton Avenue" appearing first + in the results list for the query "Clay Av". + + the view uses some of the values from the 'search_defaults.js' file to add an + additional 'SHOULD' condition which scores exact matches slightly higher + than partial matches.
+**/ + +module.exports = function( vs ){ + + // make a copy of the variables so we don't interfere with the values + // passed to other views. + var vsCopy = new peliasQuery.Vars( vs.export() ); + + // copy phrase:* values from search defaults + vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); + vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); + + // split the 'input:name' on whitespace + var name = vs.var('input:name').get(), + tokens = name.split(' '); + + // if the query is incomplete then we need to remove + // the final (incomplete) token as it will not match + // tokens in the phrase.* index. + if( !vs.var('input:name:isComplete').get() ){ + tokens.pop(); + } + + // no valid tokens to use, fail now, don't render this view. + if( tokens.length < 1 ){ return null; } + + // set 'input:name' to be only the fully completed characters + vsCopy.var('input:name').set( tokens.join(' ') ); + + return peliasQuery.view.phrase( vsCopy ); +}; diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 1c1b13c0..d0465b04 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -26,7 +26,19 @@ module.exports = { } } }], - 'should':[{ + 'should':[ + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, + { 'function_score': { 'query': { 'match': { diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index d1007343..e3a62df2 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -81,6 +81,17 @@ module.exports = { } } }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 
'slop' : 3, + 'query' : 'one two' + } + } + }, { 'function_score': { 'query': { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js new file mode 100644 index 00000000..e992cc58 --- /dev/null +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -0,0 +1,147 @@ + +module.exports = { + 'query': { + 'filtered': { + 'query': { + 'bool': { + 'must': [{ + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'type': 'phrase', + 'boost': 1, + 'slop': 3, + 'query': 'k road' + } + } + }], + 'should':[ + { + 'match': { + 'address_parts.street': { + 'query': 'k road', + 'boost': 5, + 'analyzer': 'peliasStreet' + } + } + }, { + 'match': { + 'parent.country': { + 'query': 'laird', + 'boost': 800, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region_a': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.county': { + 'query': 'laird', + 'boost': 400, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.localadmin': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.locality': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.neighbourhood': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'k road' + } + } + }, + { + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'popularity', + 
'missing': 1 + }, + 'weight': 1 + }] + } + },{ + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'population', + 'missing': 1 + }, + 'weight': 3 + }] + } + }] + } + } + } + }, + 'sort': [ '_score' ], + 'size': 20, + 'track_scores': true +}; diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index e08034a8..72cfb5f2 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -107,6 +107,23 @@ module.exports.tests.query = function(test, common) { t.deepEqual(compiled, expected, 'valid autocomplete query with source filtering'); t.end(); }); + + test('single character street address', function(t) { + var query = generate({ + text: 'k road, laird', + parsed_text: { + name: 'k road', + street: 'k road', + regions: [ 'laird' ] + } + }); + + var compiled = JSON.parse( JSON.stringify( query ) ); + var expected = require('../fixture/autocomplete_single_character_street'); + + t.deepEqual(compiled, expected, 'autocomplete_single_character_street'); + t.end(); + }); }; module.exports.all = function (tape, common) { From ca0c51b0fde45c3f863a516bc923bc122c68f7ee Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 12:22:41 +0200 Subject: [PATCH 14/24] don't strip single digits from query --- query/autocomplete.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index d64151ae..fec0a80b 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -71,7 +71,8 @@ function generateQuery( clean ){ // - to a 2gram index when using 'type:phrase' or 'operator:and' will // - result in a complete failure of the query. // 2. trim leading and trailing whitespace. 
- var text = clean.text.replace(/( .$)/g,'').trim(); + // note: single digit grams are now being produced in the name.* index + var text = clean.text.replace(/( [^0-9]$)/g,'').trim(); // if the input parser has run and suggested a 'parsed_text.name' to use. if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ From b862fc88339d5680d1ee82c0cfeb8d8bee5a6c1a Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 13:03:16 +0200 Subject: [PATCH 15/24] refactor pop_subquery to be config driven --- query/view/pop_subquery.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index bde1492b..d18b9963 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -9,7 +9,7 @@ module.exports = function( vs ){ var view = peliasQuery.view.ngrams( vs ); - view.match['name.default'].analyzer = 'peliasQueryFullToken'; + view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; return view; From 2398f05f8d96c3003f16a2124ff89961869a834c Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 13:33:08 +0200 Subject: [PATCH 16/24] fix borough matching for both autocomplete and search endpoints --- query/autocomplete.js | 1 + query/autocomplete_defaults.js | 4 ++++ query/search.js | 1 + query/text_parser.js | 1 + .../autocomplete_linguistic_with_admin.js | 9 +++++++++ .../autocomplete_single_character_street.js | 8 ++++++++ test/unit/fixture/search_full_address.js | 8 ++++++++ test/unit/fixture/search_partial_address.js | 8 ++++++++ test/unit/fixture/search_regions_address.js | 8 ++++++++ test/unit/query/search.js | 18 +++++++++--------- 10 files changed, 57 insertions(+), 9 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index fec0a80b..6da4d569 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -34,6 +34,7 @@ query.score( peliasQuery.view.admin('country_a') ); 
query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 9c432cb9..8f46ce8d 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -82,6 +82,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 200, + 'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 800, + 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', 'popularity:max_boost': 20, diff --git a/query/search.js b/query/search.js index 9f0a792c..77fcb3f5 100644 --- a/query/search.js +++ b/query/search.js @@ -30,6 +30,7 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/query/text_parser.js b/query/text_parser.js index d19465eb..914a7f43 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -9,6 +9,7 @@ or postalcode because we should only try to match those when we're sure that's w */ var adminFields = placeTypes.concat([ 'region_a', + 'borough' ]); /** diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index e3a62df2..a0b07025 100644 --- 
a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -54,6 +54,15 @@ module.exports = { } } }, + { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 800, + 'query': 'three' + } + } + }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index e992cc58..77264f8e 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -56,6 +56,14 @@ module.exports = { 'analyzer': 'peliasAdmin' } } + }, { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 800, + 'query': 'laird' + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 172d439f..dfd64e34 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -139,6 +139,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6810de54..746899b7 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -107,6 +107,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index bf3f3dce..0a8b199d 100644 --- 
a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -123,6 +123,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'manhattan', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/query/search.js b/test/unit/query/search.js index 426eb2bc..a2bb8e2f 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -25,7 +25,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_bbox'); t.end(); }); @@ -42,7 +42,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_bbox'); t.end(); }); @@ -55,7 +55,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_only'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_only'); t.end(); }); @@ -69,7 +69,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus'); t.end(); }); @@ -86,7 +86,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = 
require('../fixture/search_linguistic_viewport'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_viewport'); t.end(); }); @@ -119,7 +119,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_null_island'); t.end(); }); @@ -134,7 +134,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_full_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_full_address'); t.end(); }); @@ -149,7 +149,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_partial_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_partial_address'); t.end(); }); @@ -164,7 +164,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_regions_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_regions_address'); t.end(); }); From da4c66653871c2ee5b8d82783efbfbf16fc629fc Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:34:55 +0200 Subject: [PATCH 17/24] reduce admin:borough:boost from 800->600 --- query/autocomplete_defaults.js | 2 +- test/unit/fixture/autocomplete_linguistic_with_admin.js | 2 +- test/unit/fixture/autocomplete_single_character_street.js | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 8f46ce8d..08e33aeb 
100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -84,7 +84,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:borough:analyzer': 'peliasAdmin', 'admin:borough:field': 'parent.borough', - 'admin:borough:boost': 800, + 'admin:borough:boost': 600, 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index a0b07025..7cb51eea 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -58,7 +58,7 @@ module.exports = { 'match': { 'parent.borough': { 'analyzer': 'peliasAdmin', - 'boost': 800, + 'boost': 600, 'query': 'three' } } diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 77264f8e..f89e8493 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -60,7 +60,7 @@ module.exports = { 'match': { 'parent.borough': { 'analyzer': 'peliasAdmin', - 'boost': 800, + 'boost': 600, 'query': 'laird' } } From 9dbed08884897cfd7ac8ec4ba9715ba1dd3170ff Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:36:29 +0200 Subject: [PATCH 18/24] remove duplicate entry for borough --- query/text_parser.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/query/text_parser.js b/query/text_parser.js index 914a7f43..00e60724 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -8,8 +8,7 @@ when we can't identify parts of an address. This shouldn't contain fields like c or postalcode because we should only try to match those when we're sure that's what they are. 
*/ var adminFields = placeTypes.concat([ - 'region_a', - 'borough' + 'region_a' ]); /** From 0c67347c273deaa7cfacc5a0f1e91c7877a7f669 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:41:57 +0200 Subject: [PATCH 19/24] enable borough for search --- query/search.js | 1 + test/unit/fixture/search_full_address.js | 8 ++++++++ test/unit/fixture/search_partial_address.js | 8 ++++++++ test/unit/fixture/search_regions_address.js | 8 ++++++++ 4 files changed, 25 insertions(+) diff --git a/query/search.js b/query/search.js index 9f0a792c..77fcb3f5 100644 --- a/query/search.js +++ b/query/search.js @@ -30,6 +30,7 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 8a8290ab..bef0a7b0 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -139,6 +139,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6c4174b6..290d28e5 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -107,6 +107,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': 
vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index e0c05f3c..343dfc43 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -123,6 +123,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'manhattan', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { From e093a09a8d6a78656ea5a9920f5a0383a1b3d630 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:47:06 +0200 Subject: [PATCH 20/24] remove search related improvements from this PR --- query/search.js | 1 - test/unit/fixture/search_full_address.js | 8 -------- test/unit/fixture/search_partial_address.js | 8 -------- test/unit/fixture/search_regions_address.js | 8 -------- 4 files changed, 25 deletions(-) diff --git a/query/search.js b/query/search.js index 77fcb3f5..9f0a792c 100644 --- a/query/search.js +++ b/query/search.js @@ -30,7 +30,6 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); -query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index dfd64e34..172d439f 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -139,14 +139,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'new york', - 'boost': vs['admin:borough:boost'], - 'analyzer': 
vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 746899b7..6810de54 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -107,14 +107,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'new york', - 'boost': vs['admin:borough:boost'], - 'analyzer': vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index 0a8b199d..bf3f3dce 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -123,14 +123,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'manhattan', - 'boost': vs['admin:borough:boost'], - 'analyzer': vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { From ee73774c899010d9de62c1d08d380a383e9c90cf Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 19:48:52 +0200 Subject: [PATCH 21/24] add tokenizer, refactor how we determine if a token is 'complete' or 'incomplete' --- query/autocomplete.js | 29 +- query/view/boost_exact_matches.js | 14 +- query/view/ngrams_last_token_only.js | 17 +- query/view/phrase_first_tokens_only.js | 25 +- query/view/pop_subquery.js | 19 +- sanitiser/_tokenizer.js | 95 ++++ sanitiser/autocomplete.js | 1 + .../autocomplete_linguistic_final_token.js | 18 +- test/unit/query/autocomplete.js | 40 +- test/unit/run.js | 1 + test/unit/sanitiser/_tokenizer.js | 425 ++++++++++++++++++ test/unit/sanitiser/autocomplete.js | 5 +- 12 files changed, 616 insertions(+), 73 deletions(-) create mode 100644 sanitiser/_tokenizer.js create mode 100644 test/unit/sanitiser/_tokenizer.js diff --git 
a/query/autocomplete.js b/query/autocomplete.js index 6da4d569..50f6da29 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -63,31 +63,24 @@ function generateQuery( clean ){ vs.var( 'sources', clean.sources ); } - // mark the name as incomplete (user has not yet typed a comma) - vs.var( 'input:name:isComplete', false ); - - // perform some operations on 'clean.text': - // 1. if there is a space followed by a single char, remove them. - // - this is required as the index uses 2grams and sending 1grams - // - to a 2gram index when using 'type:phrase' or 'operator:and' will - // - result in a complete failure of the query. - // 2. trim leading and trailing whitespace. - // note: single digit grams are now being produced in the name.* index - var text = clean.text.replace(/( [^0-9]$)/g,'').trim(); + // pass the input tokens to the views so they can choose which tokens + // are relevant for their specific function. + if( check.array( clean.tokens ) ){ + vs.var( 'input:name:tokens', clean.tokens ); + vs.var( 'input:name:tokens_complete', clean.tokens_complete ); + vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete ); + } + + // input text + vs.var( 'input:name', clean.text ); // if the input parser has run and suggested a 'parsed_text.name' to use. if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - // mark the name as complete (user has already typed a comma) - vs.var( 'input:name:isComplete', true ); - // use 'parsed_text.name' instead of 'clean.text'. 
- text = clean.parsed_text.name; + vs.var( 'input:name', clean.parsed_text.name ); } - // input text - vs.var( 'input:name', text ); - // focus point if( check.number(clean['focus.point.lat']) && check.number(clean['focus.point.lon']) ){ diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js index 8cf575f4..9af56cfb 100644 --- a/query/view/boost_exact_matches.js +++ b/query/view/boost_exact_matches.js @@ -27,19 +27,11 @@ module.exports = function( vs ){ vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); - // split the 'input:name' on whitespace - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // if the query is incomplete then we need to remove - // the final (incomplete) token as it will not match - // tokens in the phrase.* index. - if( !vs.var('input:name:isComplete').get() ){ - tokens.pop(); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); // no valid tokens to use, fail now, don't render this view. - if( tokens.length < 1 ){ return null; } + if( !tokens || tokens.length < 1 ){ return null; } // set 'input:name' to be only the fully completed characters vsCopy.var('input:name').set( tokens.join(' ') ); diff --git a/query/view/ngrams_last_token_only.js b/query/view/ngrams_last_token_only.js index 3e3315f7..2665c294 100644 --- a/query/view/ngrams_last_token_only.js +++ b/query/view/ngrams_last_token_only.js @@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'), eg. if the input was "100 foo str", then 'input:name' would only be 'str' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. 
- code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'), module.exports = function( vs ){ - // Totally disable this view when bool value 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ return null; } + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable - var name = vs.var('input:name').get(); - // set the 'name' variable in the copy to only the last token - vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return ngrams_strict( vsCopy ); diff --git a/query/view/phrase_first_tokens_only.js b/query/view/phrase_first_tokens_only.js index b047b30f..7ab4539b 100644 --- a/query/view/phrase_first_tokens_only.js +++ b/query/view/phrase_first_tokens_only.js @@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query'); eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. 
@@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query'); module.exports = function( vs ){ - // Don't mutate the name variable when 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ - // return the view rendered using the original vars - return peliasQuery.view.phrase( vs ); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable and split in to tokens - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // single token only, abort (we don't want the *last* token) - // return null here will completely disable the view. - if( tokens.length < 2 ){ return null; } - // set the 'name' variable in the copy to all but the last token - vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return peliasQuery.view.phrase( vsCopy ); diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index d18b9963..f29191fc 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -1,5 +1,6 @@ -var peliasQuery = require('pelias-query'); +var peliasQuery = require('pelias-query'), + check = require('check-types'); /** Population / Popularity subquery @@ -12,5 +13,21 @@ module.exports = function( vs ){ view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; + // only use complete tokens against the phase index (where possible). 
+ var completeTokens = vs.var('input:name:tokens_complete').get(), + incompleteTokens = vs.var('input:name:tokens_incomplete').get(); + + // if the tokenizer has run (autocomplete only) then we will combine the + // 'complete' tokens with the 'incomplete' tokens, the resuting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in then + // ngrams index. + if( check.array( completeTokens ) && check.array( incompleteTokens ) ){ + var combined = completeTokens.concat( incompleteTokens ); + if( combined.length ){ + view.match['name.default'].query = combined.join(' '); + } + } + return view; }; diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js new file mode 100644 index 00000000..7b8e234c --- /dev/null +++ b/sanitiser/_tokenizer.js @@ -0,0 +1,95 @@ + +var check = require('check-types'); + +/** + simplified version of the elasticsearch tokenizer, used in order to + be able to detect which tokens are 'complete' (user has finished typing them) + or 'incomplete' (the user has possibly only typed part of the token). + + note: we don't need to strip punctuation as that will be handled on the + elasticsearch side, so sending a token such as 'st.' is not an issue, these + tokens should *not* be modified as the analysis can use the punctuation to + infer meaning. + + note: this sanitizer should run *after* the '_text' sanitizer so it can + use the output of clean.parsed_text where available. +**/ +function sanitize( raw, clean ){ + + // error & warning messages + var messages = { errors: [], warnings: [] }; + + // this is the string we will use for analysis + var text = clean.text; + + // a boolean to track whether the input parser successfully ran; or not. + var inputParserRanSuccessfully = false; + + // if the text parser has run then we only tokenize the 'name' section + // of the 'parsed_text' object, ignoring the 'admin' parts.
+ if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ + inputParserRanSuccessfully = true; + text = clean.parsed_text.name; // use this string instead + } + + // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. + clean.tokens = []; + clean.tokens_complete = []; + clean.tokens_incomplete = []; + + // sanity check that the text is valid. + if( check.nonEmptyString( text ) ){ + + // split according to the regex used in the elasticsearch tokenizer + // see: https://github.com/pelias/schema/blob/master/settings.js + // see: settings.analysis.tokenizer.peliasNameTokenizer + clean.tokens = text + .split(/[\s,\\\/]+/) // split on delimiters + .filter(function(el){return el;}); // remove empty elements + } + + /** + the following section splits the tokens in to two arrays called + 'tokens_complete' and 'tokens_incomplete'. + + it also strips any tokens from 'tokens_incomplete' which might not + match the ngrams index (such as single grams not stored in the index). + **/ + + // split the tokens in to 'complete' and 'incomplete'. + if( clean.tokens.length ){ + + // if all the tokens are complete, simply copy them from clean.tokens + if( inputParserRanSuccessfully ){ + + // all these tokens are complete! + clean.tokens_complete = clean.tokens.slice(); + + // user hasn't finished typing yet + } else { + + // make a copy of the tokens and remove the last element + var tokensCopy = clean.tokens.slice(), + lastToken = tokensCopy.pop(); + + // set all but the last token as 'complete' + clean.tokens_complete = tokensCopy; + + /** + if the last token is a single non-numeric character then we must discard it. + + at time of writing, single non-numeric ngrams are not stored in the index, + sending them as part of the query would result in 0 documents being returned.
+ **/ + if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){ + clean.tokens_incomplete = [ lastToken ]; + } + } + + } + + return messages; +} + +// export function +module.exports = sanitize; diff --git a/sanitiser/autocomplete.js b/sanitiser/autocomplete.js index f9698956..8ab6fd9c 100644 --- a/sanitiser/autocomplete.js +++ b/sanitiser/autocomplete.js @@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'), sanitizers = { singleScalarParameters: require('../sanitiser/_single_scalar_parameters'), text: require('../sanitiser/_text'), + tokenizer: require('../sanitiser/_tokenizer'), size: require('../sanitiser/_size')(10, 10, 10), layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping), sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping), diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fc431c77..b4cc33d2 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,15 +7,25 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', + 'boost': 1, + 'slop': 3, 'query': 'one', - 'type': 'phrase', - 'operator': 'and' + 'type': 'phrase' } } }], 'should':[{ + 'match': { + 'phrase.default': { + 'analyzer': 'peliasPhrase', + 'boost': 1, + 'slop': 3, + 'query': 'one', + 'type': 'phrase' + } + } + },{ 'function_score': { 'query': { 'match': { diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 72cfb5f2..bb368fc9 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -13,7 +13,10 @@ module.exports.tests.interface = function(test, common) { module.exports.tests.query = function(test, common) { test('valid lingustic-only autocomplete', function(t) { var query = generate({ - 
text: 'test' + text: 'test', + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -25,7 +28,10 @@ module.exports.tests.query = function(test, common) { test('valid lingustic autocomplete with 3 tokens', function(t) { var query = generate({ - text: 'one two three' + text: 'one two three', + tokens: ['one','two','three'], + tokens_complete: ['one','two'], + tokens_incomplete: ['three'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -42,7 +48,10 @@ module.exports.tests.query = function(test, common) { name: 'one two', regions: [ 'one two', 'three' ], admin_parts: 'three' - } + }, + tokens: ['one','two'], + tokens_complete: ['one','two'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -57,7 +66,10 @@ module.exports.tests.query = function(test, common) { // note: if 1 grams are enabled at a later date, remove this behaviour. test('valid lingustic autocomplete final token', function(t) { var query = generate({ - text: 'one t' + text: 'one t', + tokens: ['one','t'], + tokens_complete: ['one'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -71,7 +83,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 29.49136, - 'focus.point.lon': -82.50622 + 'focus.point.lon': -82.50622, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -85,7 +100,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 0, - 'focus.point.lon': 0 + 'focus.point.lon': 0, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -98,7 +116,10 @@ module.exports.tests.query = function(test, common) { test('valid sources filter', function(t) { var query = 
generate({ 'text': 'test', - 'sources': ['test_source'] + 'sources': ['test_source'], + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -115,7 +136,10 @@ module.exports.tests.query = function(test, common) { name: 'k road', street: 'k road', regions: [ 'laird' ] - } + }, + tokens: ['k', 'road'], + tokens_complete: ['k', 'road'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); diff --git a/test/unit/run.js b/test/unit/run.js index 94d9ebb3..1a6f7a90 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -46,6 +46,7 @@ var tests = [ require('./sanitiser/_sources'), require('./sanitiser/_sources_and_layers'), require('./sanitiser/_text'), + require('./sanitiser/_tokenizer'), require('./sanitiser/_deprecate_quattroshapes'), require('./src/backend'), require('./sanitiser/autocomplete'), diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js new file mode 100644 index 00000000..a7c6ced4 --- /dev/null +++ b/test/unit/sanitiser/_tokenizer.js @@ -0,0 +1,425 @@ +var sanitiser = require('../../../sanitiser/_tokenizer'); + +module.exports.tests = {}; + +module.exports.tests.sanity_checks = function(test, common) { + test('clean.text not set', function(t) { + + var clean = {}; // clean.text not set + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.text not a string', function(t) { + + var clean = { text: {} }; // clean.text not a string + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no 
tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('empty string', function(t) { + + var clean = { text: '' }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { + + var clean = { parsed_text: { text: {} } }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.text', function(t) { + + var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.space_delimiter = function(test, common) { + test('space delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street new york' }; + var messages = sanitiser({}, clean); + + 
// tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('space delimiter - multiple spaces / other whitespace', function(t) { + + var clean = { text: ' 30 west \t26th \nstreet new york ' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.comma_delimiter = function(test, common) { + test('comma delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street, new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + 
t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('comma delimiter - multiple commas', function(t) { + + var clean = { text: ',30 west 26th street,,, new york,' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.forward_slash_delimiter = function(test, common) { + test('forward slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street/133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('forward slash - multiple slashes', function(t) { + + var clean = { text: '/Bedell Street//133rd Avenue/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ 
+ 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.final_token_single_gram = function(test, common) { + test('final token single gram - numeric', function(t) { + + var clean = { text: 'grolmanstrasse 1' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + '1' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + '1' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('final token single gram - non-numeric', function(t) { + + var clean = { text: 'grolmanstrasse a' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + 'a' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token removed! 
+ t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.back_slash_delimiter = function(test, common) { + test('back slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street\\133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('back slash - multiple slashes', function(t) { + + var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.mixed_delimiter = function(test, common) { + test('mixed delimiters', function(t) { + + var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('SANITISER _tokenizer: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitiser/autocomplete.js 
b/test/unit/sanitiser/autocomplete.js index 26bf9afb..186cb4b6 100644 --- a/test/unit/sanitiser/autocomplete.js +++ b/test/unit/sanitiser/autocomplete.js @@ -4,7 +4,10 @@ module.exports.tests = {}; module.exports.tests.sanitisers = function(test, common) { test('check sanitiser list', function (t) { - var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; + var expected = [ + 'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources', + 'sources_and_layers', 'private', 'geo_autocomplete' + ]; t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.end(); }); From 05240626fd4e0b798df86d8f82b618509c44ddba Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 16:56:33 +0200 Subject: [PATCH 22/24] handle addressit case where parsed_text.street is produced and parsed_text.name is not --- sanitiser/_tokenizer.js | 21 ++++++++++++++++++-- test/unit/sanitiser/_tokenizer.js | 32 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js index 7b8e234c..3312ea05 100644 --- a/sanitiser/_tokenizer.js +++ b/sanitiser/_tokenizer.js @@ -27,9 +27,26 @@ function sanitize( raw, clean ){ // if the text parser has run then we only tokenize the 'name' section // of the 'parsed_text' object, ignoring the 'admin' parts. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ + if( clean.hasOwnProperty('parsed_text') ) { inputParserRanSuccessfully = true; - text = clean.parsed_text.name; // use this string instead + + // parsed_text.name is set, this is the highest priority, use this string + if( clean.parsed_text.hasOwnProperty('name') ){ + text = clean.parsed_text.name; // use this string instead + } + + // else handle the case where parsed_text.street was produced but + // no parsed_text.name is produced. 
+ // additionally, handle the case where parsed_text.number is present + // note: the addressit module may also produce parsed_text.unit info + // for now, we discard that information as we don't have an appropriate + else if( clean.parsed_text.hasOwnProperty('street') ){ + text = [ + clean.parsed_text.number, + clean.parsed_text.street + ].filter(function(el){return el;}) + .join(' '); // remove empty elements + } } // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js index a7c6ced4..8837d4ab 100644 --- a/test/unit/sanitiser/_tokenizer.js +++ b/test/unit/sanitiser/_tokenizer.js @@ -81,6 +81,38 @@ module.exports.tests.sanity_checks = function(test, common) { t.deepEquals(messages.errors, [], 'no errors'); t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); + test('favor clean.parsed_text street data over clean.text', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over all other variables + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + 
t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); }); }; From 1c9af40f3cc274cfe77e33716501f2c4a0a26f7a Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 17:16:44 +0200 Subject: [PATCH 23/24] remove query.tokens_complete and query.tokens_incomplete from geoJSON --- middleware/geocodeJSON.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/middleware/geocodeJSON.js b/middleware/geocodeJSON.js index 80d69e86..414d216b 100644 --- a/middleware/geocodeJSON.js +++ b/middleware/geocodeJSON.js @@ -16,7 +16,7 @@ function setup(peliasConfig, basePath) { config: peliasConfig || require('pelias-config').generate().api, basePath: basePath || '/' }; - + function middleware(req, res, next) { return convertToGeocodeJSON(req, res, next, opts); } @@ -56,6 +56,10 @@ function convertToGeocodeJSON(req, res, next, opts) { // Helpful for debugging and understanding how the input impacts results. res.body.geocoding.query = req.clean; + // remove arrays produced by the tokenizer (only intended to be used internally). + delete res.body.geocoding.query.tokens_complete; + delete res.body.geocoding.query.tokens_incomplete; + // OPTIONAL. Warnings and errors. 
addMessages(req, 'warnings', res.body.geocoding); addMessages(req, 'errors', res.body.geocoding); From 979aab1ac3231cf598ebcf0df1c4d6f2fee26fa4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 19:10:52 +0200 Subject: [PATCH 24/24] ensure that problematic single grams are removed from the query --- query/autocomplete.js | 15 ++++++++++----- query/view/pop_subquery.js | 16 ---------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 50f6da29..6d5863a8 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -74,11 +74,16 @@ function generateQuery( clean ){ // input text vs.var( 'input:name', clean.text ); - // if the input parser has run and suggested a 'parsed_text.name' to use. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - - // use 'parsed_text.name' instead of 'clean.text'. - vs.var( 'input:name', clean.parsed_text.name ); + // if the tokenizer has run then we set 'input:name' to the combination of the + // 'complete' tokens with the 'incomplete' tokens, the resulting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in the + // ngrams index. + if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){ + var combined = clean.tokens_complete.concat( clean.tokens_incomplete ); + if( combined.length ){ + vs.var( 'input:name', combined.join(' ') ); + } } // focus point diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index f29191fc..724b773f 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -13,21 +13,5 @@ module.exports = function( vs ){ view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; - // only use complete tokens against the phase index (where possible).
- var completeTokens = vs.var('input:name:tokens_complete').get(), - incompleteTokens = vs.var('input:name:tokens_incomplete').get(); - - // if the tokenizer has run (autocomplete only) then we will combine the - // 'complete' tokens with the 'incomplete' tokens, the resuting array differs - // slightly from the 'input:name:tokens' array as some tokens might have been - // removed in the process; such as single grams which are not present in then - // ngrams index. - if( check.array( completeTokens ) && check.array( incompleteTokens ) ){ - var combined = completeTokens.concat( incompleteTokens ); - if( combined.length ){ - view.match['name.default'].query = combined.join(' '); - } - } - return view; };