From ab436d5dfb509e6af39f8295d70edd3c357ffc10 Mon Sep 17 00:00:00 2001 From: Diana Shkolnikov Date: Mon, 10 Aug 2015 16:46:19 -0400 Subject: [PATCH] Search query builder not checking all available admin values Fixes #187 --- helper/address_weights.js | 13 ++ helper/adminFields.js | 30 ++++ helper/category_weights.js | 9 +- package.json | 5 +- query/search.js | 132 ++++++++-------- query/sort.js | 11 +- test/unit/helper/adminFields.js | 84 +++++++++++ test/unit/query/reverse.js | 2 +- test/unit/query/search.js | 257 ++++++++++++++++++++++++++++++-- test/unit/query/sort.js | 2 +- test/unit/run.js | 3 +- 11 files changed, 466 insertions(+), 82 deletions(-) create mode 100644 helper/address_weights.js create mode 100644 helper/adminFields.js create mode 100644 test/unit/helper/adminFields.js diff --git a/helper/address_weights.js b/helper/address_weights.js new file mode 100644 index 00000000..6b15ab4c --- /dev/null +++ b/helper/address_weights.js @@ -0,0 +1,13 @@ +/** + * These values specify how much a document that matches certain parts of an address + * should be boosted in elasticsearch results. + */ + +module.exports = { + number: 1, + street: 3, + zip: 3, + admin2: 2, + admin1_abbr: 3, + alpha3: 5 +}; diff --git a/helper/adminFields.js b/helper/adminFields.js new file mode 100644 index 00000000..aa96c203 --- /dev/null +++ b/helper/adminFields.js @@ -0,0 +1,30 @@ +var schema = require('pelias-schema'); +var logger = require( 'pelias-logger' ).get( 'api' ); + +var ADMIN_FIELDS = [ + 'admin0', + 'admin1', + 'admin1_abbr', + 'admin2', + 'local_admin', + 'locality', + 'neighborhood' +]; + +function getAvailableAdminFields() { + var actualFields = Object.keys(schema.mappings._default_.properties); + + // check if expected fields are actually in current schema + var available = ADMIN_FIELDS.filter(function (field) { + return (actualFields.indexOf(field) !== -1); + }); + + if (available.length === 0) { + logger.error('helper/adminFields: no expected admin fields found in schema'); + } + + return available; +} + +module.exports.availableFields = getAvailableAdminFields(); +module.exports.expectedFields = ADMIN_FIELDS; \ No newline at end of file diff --git a/helper/category_weights.js b/helper/category_weights.js index 535ad2f2..5176b70e 100644 --- a/helper/category_weights.js +++ b/helper/category_weights.js @@ -3,7 +3,14 @@ * should be boosted in elasticsearch results. */ -module.exports = { +module.exports.default = { + 'transport:air': 2, + 'transport:air:aerodrome': 2, + 'transport:air:airport': 2, + 'admin': 2 +}; + +module.exports.address = { 'transport:air': 2, 'transport:air:aerodrome': 2, 'transport:air:airport': 2 diff --git a/package.json b/package.json index dcf30f49..a9c3c7e1 100644 --- a/package.json +++ b/package.json @@ -33,9 +33,11 @@ "elasticsearch": ">=1.2.1" }, "dependencies": { + "addressit": "1.3.0", "async": "^0.9.0", "cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve", "express": "^4.8.8", + "extend": "2.0.1", "geojson": "^0.2.1", "geojson-extent": "^0.3.1", "geopipes-elasticsearch-backend": "^0.2.0", @@ -44,10 +46,9 @@ "microtime": "1.4.0", "morgan": "1.5.2", "pelias-config": "^0.1.4", - "extend": "2.0.1", - "addressit": "1.3.0", "pelias-esclient": "0.0.25", "pelias-logger": "^0.0.8", + "pelias-schema": "1.0.0", "pelias-suggester-pipeline": "2.0.2", "through2": "0.6.5" }, diff --git a/query/search.js b/query/search.js index 8e34b404..41a4bff6 100644 --- a/query/search.js +++ b/query/search.js @@ -1,6 +1,9 @@ var queries = require('geopipes-elasticsearch-backend').queries, - sort = require('../query/sort'); + sort = require('../query/sort'), + adminFields = require('../helper/adminFields').availableFields, + addressWeights = require('../helper/address_weights'); + function generate( params ){ var centroid = null; @@ -27,22 +30,7 @@ function generate( params ){ }; if (params.parsed_input) { - - query.query.filtered.query.bool.should = []; - - var unmatched_admin_fields = []; - // qb stands for query builder - var qb = function(unmatched_admin_fields, value) { - if (value) { - unmatched_admin_fields.forEach(function(admin_field) { - var match = {}; - match[admin_field] = value; - query.query.filtered.query.bool.should.push({ - 'match': match - }); - }); - } - }; + addParsedMatch(query, input, params.parsed_input); // update input if (params.parsed_input.number && params.parsed_input.street) { @@ -50,53 +38,6 @@ function generate( params ){ } else if (params.parsed_input.admin_parts) { input = params.parsed_input.name; } - - // address - // number, street, postalcode - if (params.parsed_input.number) { - qb(['address.number'], params.parsed_input.number); - } - if (params.parsed_input.street) { - qb(['address.street'], params.parsed_input.street); - } - if (params.parsed_input.postalcode) { - qb(['address.zip'], params.parsed_input.postalcode); - } - - // city - // admin2, locality, local_admin, neighborhood - if (params.parsed_input.city) { - qb(['admin2'], params.parsed_input.admin2); - } else { - unmatched_admin_fields.push('admin2'); - } - - // state - // admin1, admin1_abbr - if (params.parsed_input.state) { - qb(['admin1_abbr'], params.parsed_input.state); - } else { - unmatched_admin_fields.push('admin1', 'admin1_abbr'); - } - - // country - // admin0, alpha3 - if (params.parsed_input.country) { - qb(['alpha3'], params.parsed_input.country); - } else { - unmatched_admin_fields.push('admin0', 'alpha3'); - } - - var input_regions = params.parsed_input.regions ? params.parsed_input.regions.join(' ') : undefined; - // if no address was identified and input suggests some admin info in it - if (unmatched_admin_fields.length === 5 && input_regions !== params.input) { - if (params.parsed_input.admin_parts) { - qb(unmatched_admin_fields, params.parsed_input.admin_parts); - } else { - qb(unmatched_admin_fields, input_regions); - } - } - } // add search condition to distance query @@ -119,4 +60,67 @@ function generate( params ){ return query; } +function addParsedMatch(query, defaultInput, parsedInput) { + query.query.filtered.query.bool.should = []; + + // copy expected admin fields so we can remove them as we parse the address + var unmatchedAdminFields = adminFields.slice(); + + // address + // number, street, postalcode + addMatch(query, unmatchedAdminFields, 'address.number', parsedInput.number, addressWeights.number); + addMatch(query, unmatchedAdminFields, 'address.street', parsedInput.street, addressWeights.street); + addMatch(query, unmatchedAdminFields, 'address.zip', parsedInput.postalcode, addressWeights.zip); + + // city + // admin2, locality, local_admin, neighborhood + addMatch(query, unmatchedAdminFields, 'admin2', parsedInput.admin2, addressWeights.admin2); + + // state + // admin1, admin1_abbr + addMatch(query, unmatchedAdminFields, 'admin1_abbr', parsedInput.state, addressWeights.admin1_abbr); + + // country + // admin0, alpha3 + addMatch(query, unmatchedAdminFields, 'alpha3', parsedInput.country, addressWeights.alpha3); + + var inputRegions = parsedInput.regions ? parsedInput.regions.join(' ') : undefined; + // if no address was identified and input suggests some admin info in it + if (unmatchedAdminFields.length > 0 && inputRegions !== defaultInput) { + unmatchedAdminFields.forEach(function (key) { + if (parsedInput.admin_parts) { + addMatch(query, [], key, parsedInput.admin_parts); + } + else { + addMatch(query, [], key, inputRegions); + } + }); + } +} + +function addMatch(query, unmatched, key, value, boost) { // jshint ignore:line + if (typeof value === 'undefined') { + return; + } + + var match = {}; + + if (boost) { + match[key] = { + query: value, + boost: boost + }; + } + else { + match[key] = value; + } + + query.query.filtered.query.bool.should.push({ 'match': match }); + + var index = unmatched.indexOf(key); + if (index !== -1) { + unmatched.splice(index, 1); + } +} + module.exports = generate; diff --git a/query/sort.js b/query/sort.js index 13f530c9..aaca76e5 100644 --- a/query/sort.js +++ b/query/sort.js @@ -43,7 +43,7 @@ module.exports = function( params ){ { '_script': { 'params': { - 'category_weights': category_weights + 'category_weights': getCategoryWeights(params) }, 'file': category, 'type': 'number', @@ -64,3 +64,12 @@ module.exports = function( params ){ return scriptsConfig; }; + +function getCategoryWeights(params) { + if (params && params.hasOwnProperty('parsed_input') && + (params.parsed_input.hasOwnProperty('number') || + params.parsed_input.hasOwnProperty('street'))) { + return category_weights.address; + } + return category_weights.default; +} \ No newline at end of file diff --git a/test/unit/helper/adminFields.js b/test/unit/helper/adminFields.js new file mode 100644 index 00000000..9d72b934 --- /dev/null +++ b/test/unit/helper/adminFields.js @@ -0,0 +1,84 @@ +var proxyquire = require('proxyquire'); + +module.exports.tests = {}; + +module.exports.tests.interface = function(test, common) { + test('validate fields', function(t) { + var adminFields = require('../../../helper/adminFields').availableFields; + t.assert(adminFields instanceof Array, 'adminFields is an array'); + t.assert(adminFields.length > 0, 'adminFields array is not empty'); + t.end(); + }); + test('validate fields', function(t) { + var adminFields = require('../../../helper/adminFields').expectedFields; + t.assert(adminFields instanceof Array, 'adminFields is an array'); + t.assert(adminFields.length > 0, 'adminFields array is not empty'); + t.end(); + }); +}; + +module.exports.tests.lookupExistance = function(test, common) { + test('all expected fields in schema', function(t) { + + var expectedFields = require('../../../helper/adminFields').expectedFields; + var schemaMock = { mappings: { _default_: { properties: {} } } }; + + // inject all expected fields into schema mock + expectedFields.forEach(function (field) { + schemaMock.mappings._default_.properties[field] = {}; + }); + + var adminFields = proxyquire('../../../helper/adminFields', {'pelias-schema': schemaMock}); + + t.deepEquals(adminFields.availableFields, adminFields.expectedFields, 'all expected fields are returned'); + t.end(); + }); + + test('some expected fields in schema', function(t) { + + var expectedFields = require('../../../helper/adminFields').expectedFields.slice(0, 3); + var schemaMock = { mappings: { _default_: { properties: {} } } }; + + // inject all expected fields into schema mock + expectedFields.forEach(function (field) { + schemaMock.mappings._default_.properties[field] = {}; + }); + + var adminFields = proxyquire('../../../helper/adminFields', {'pelias-schema': schemaMock}); + + t.deepEquals(adminFields.availableFields, expectedFields, 'only matching expected fields are returned'); + t.end(); + }); + + test('no expected fields in schema', function(t) { + + var schemaMock = { mappings: { _default_: { properties: { foo: {} } } } }; + + var loggerMock = { get: function (name) { + t.equal(name, 'api'); + return { + error: function () {} + }; + }}; + + var adminFields = proxyquire('../../../helper/adminFields', + { + 'pelias-schema': schemaMock, + 'pelias-logger': loggerMock + }); + + t.deepEquals([], adminFields.availableFields, 'no admin fields found'); + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('adminFields: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; \ No newline at end of file diff --git a/test/unit/query/reverse.js b/test/unit/query/reverse.js index a84bf1c7..e200b832 100644 --- a/test/unit/query/reverse.js +++ b/test/unit/query/reverse.js @@ -53,7 +53,7 @@ var sort = [ { '_script': { 'params': { - 'category_weights': category_weights + 'category_weights': category_weights.default }, 'file': category, 'type': 'number', diff --git a/test/unit/query/search.js b/test/unit/query/search.js index dcc9d745..0413706e 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -54,7 +54,7 @@ var sort = [ { '_script': { 'params': { - 'category_weights': category_weights + 'category_weights': category_weights.default }, 'file': category, 'type': 'number', @@ -267,27 +267,72 @@ module.exports.tests.query = function(test, common) { 'should': [ { 'match': { - 'address.number': 123 + 'address.number': { + 'query': 123, + 'boost': 1 + } } }, { 'match': { - 'address.street': 'main st' + 'address.street': { + 'query': 'main st', + 'boost': 3 + } } }, { 'match': { - 'address.zip': 10010 + 'address.zip': { + 'query': 10010, + 'boost': 3 + } } }, { 'match': { - 'admin1_abbr': 'NY' + 'admin1_abbr': { + 'query': 'NY', + 'boost': 3 + } } }, { 'match': { - 'alpha3': 'USA' + 'alpha3': { + 'query': 'USA', + 'boost': 5 + } + } + }, + { + match: { + admin0: 'new york' + } + }, + { + match: { + admin1: 'new york' + } + }, + { + match: { + admin2: 'new york' + } + }, + { + match: { + local_admin: 'new york' + } + }, + { + match: { + locality: 'new york' + } + }, + { + match: { + neighborhood: 'new york' } }, { @@ -416,7 +461,7 @@ module.exports.tests.query = function(test, common) { 'should': [ { 'match': { - 'admin2': 'new york' + 'admin0': 'new york' } }, { @@ -431,12 +476,22 @@ module.exports.tests.query = function(test, common) { }, { 'match': { - 'admin0': 'new york' + 'admin2': 'new york' } }, { 'match': { - 'alpha3': 'new york' + 'local_admin': 'new york' + } + }, + { + 'match': { + 'locality': 'new york' + } + }, + { + 'match': { + 'neighborhood': 'new york' } }, { @@ -501,7 +556,8 @@ module.exports.tests.query = function(test, common) { 'category_weights': { 'transport:air': 2, 'transport:air:aerodrome': 2, - 'transport:air:airport': 2 + 'transport:air:airport': 2, + 'admin': 2 } }, 'file': 'category', @@ -534,10 +590,189 @@ module.exports.tests.query = function(test, common) { ], 'track_scores': true }; - + + t.deepEqual(query, expected, 'valid search query'); + t.end(); + }); + + test('valid query with regions in address', function(t) { + var partial_address = '1 water st manhattan ny'; + var query = generate({ input: partial_address, + layers: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood', + 'locality', 'local_admin', 'osmaddress', 'openaddresses' ], + size: 10, + details: true, + parsed_input: parser(partial_address), + default_layers_set: true + }); + + var expected = { + 'query': { + 'filtered': { + 'query': { + 'bool': { + 'must': [ + { + 'match': { + 'name.default': '1 water st' + } + } + ], + 'should': [ + { + match: { + 'address.number': { + 'query': 1, + 'boost': 1 + } + } + }, + { + match: { + 'address.street': { + 'query': 'water st', + 'boost': 3 + } + } + }, + { + 'match': { + 'admin1_abbr': { + 'query': 'NY', + 'boost': 3 + } + } + }, + { + 'match': { + 'admin0': 'manhattan' + } + }, + { + 'match': { + 'admin1': 'manhattan' + } + }, + { + 'match': { + 'admin2': 'manhattan' + } + }, + { + match: { + local_admin: 'manhattan' + } + }, + { + match: { + locality: 'manhattan' + } + }, + { + match: { + neighborhood: 'manhattan' + } + }, + { + 'match': { + 'phrase.default': '1 water st' + } + } + ] + } + }, + 'filter': { + 'bool': { + 'must': [] + } + } + } + }, + 'size': 10, + 'sort': [ + '_score', + { + '_script': { + 'file': 'admin_boost', + 'type': 'number', + 'order': 'desc' + } + }, + { + '_script': { + 'file': 'popularity', + 'type': 'number', + 'order': 'desc' + } + }, + { + '_script': { + 'file': 'population', + 'type': 'number', + 'order': 'desc' + } + }, + { + '_script': { + 'params': { + 'weights': { + 'admin0': 4, + 'admin1': 3, + 'admin2': 2, + 'local_admin': 1, + 'locality': 1, + 'neighborhood': 1 + } + }, + 'file': 'weights', + 'type': 'number', + 'order': 'desc' + } + }, + { + '_script': { + 'params': { + 'category_weights': { + 'transport:air': 2, + 'transport:air:aerodrome': 2, + 'transport:air:airport': 2 + } + }, + 'file': 'category', + 'type': 'number', + 'order': 'desc' + } + }, + { + '_script': { + 'params': { + 'weights': { + 'geoname': 0, + 'address': 4, + 'osmnode': 6, + 'osmway': 6, + 'poi-address': 8, + 'neighborhood': 10, + 'local_admin': 12, + 'locality': 12, + 'admin2': 12, + 'admin1': 14, + 'admin0': 2 + } + }, + 'file': 'weights', + 'type': 'number', + 'order': 'desc' + } + } + ], + 'track_scores': true + }; + t.deepEqual(query, expected, 'valid search query'); t.end(); }); + }; module.exports.all = function (tape, common) { diff --git a/test/unit/query/sort.js b/test/unit/query/sort.js index 1a2fe5d1..f534d8ae 100644 --- a/test/unit/query/sort.js +++ b/test/unit/query/sort.js @@ -53,7 +53,7 @@ var expected = [ { '_script': { 'params': { - 'category_weights': category_weights + 'category_weights': category_weights.default }, 'file': category, 'type': 'number', diff --git a/test/unit/run.js b/test/unit/run.js index eb7cad06..75051927 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -19,7 +19,8 @@ var tests = [ require('./query/reverse'), require('./helper/query_parser'), require('./helper/geojsonify'), - require('./helper/outputSchema') + require('./helper/outputSchema'), + require('./helper/adminFields'), ]; tests.map(function(t) {