Browse Source

Separate concerns of address parser

The address parser currently does two things:
1.) make some intelligent guesses as to possible admin regions to
explicitly search against to improve the quality of results returned
2.) make some intelligent guesses as to when no part of the query needs
to search against anything other than admin regions. This somewhat
improves the quality of results returned but mostly improves the speed
of the Elasticsearch query since it's searching significantly fewer
records.

These two concerns are now split into two separate methods within the
query_parser helper module. They are mostly independent today, but don't
have to be in the future.
pull/221/head
Julian Simioni 9 years ago
parent
commit
7a6ac8541b
  1. 42
      helper/query_parser.js
  2. 5
      sanitiser/_input.js
  3. 17
      test/unit/helper/query_parser.js
  4. 6
      test/unit/query/search.js
  5. 2
      test/unit/sanitiser/_input.js
  6. 7
      test/unit/sanitiser/search.js

42
helper/query_parser.js

@ -1,10 +1,25 @@
var parser = require('addressit'); var parser = require('addressit');
var extend = require('extend'); var extend = require('extend');
var get_layers = require('../helper/layers'); var get_layers_helper = require('../helper/layers');
var delim = ','; var delim = ',';
module.exports = function(query) { module.exports = {};
module.exports.get_layers = function get_layers(query) {
var tokenized = query.split(/[ ,]+/);
var hasNumber = /\d/.test(query);
if (query.length <= 3 ) {
// no address parsing required
return get_layers_helper(['admin']);
} else if (tokenized.length === 1 || (tokenized.length < 3 && !hasNumber)) {
// no need to hit address layers if there's only one (or two) token(s)
return get_layers_helper(['admin', 'poi']);
}
};
module.exports.get_parsed_address = function get_parsed_address(query) {
var tokenized = query.split(/[ ,]+/); var tokenized = query.split(/[ ,]+/);
var hasNumber = /\d/.test(query); var hasNumber = /\d/.test(query);
@ -22,32 +37,18 @@ module.exports = function(query) {
return address; return address;
}; };
var getTargetLayersWhenAddressParsingIsNotNecessary = function(query) {
var address = {};
// set target_layer if input length <= 3 characters
if (query.length <= 3 ) {
// no address parsing required
address.target_layer = get_layers(['admin']);
} else if (tokenized.length === 1 || (tokenized.length < 3 && !hasNumber)) {
// no need to hit address layers if there's only one (or two) token(s)
address.target_layer = get_layers(['admin', 'poi']);
}
return address.target_layer ? address : null;
};
var getAddressParts = function(query) { var getAddressParts = function(query) {
// perform full address parsing // perform full address parsing
// except on queries so short they obviously can't contain an address // except on queries so short they obviously can't contain an address
if (query.length > 3) {
return parser( query ); return parser( query );
}
}; };
var addressWithAdminParts = getAdminPartsBySplittingOnDelim(query); var addressWithAdminParts = getAdminPartsBySplittingOnDelim(query);
var addressWithTargetLayers= getTargetLayersWhenAddressParsingIsNotNecessary(query); var addressWithAddressParts= getAddressParts(query);
var addressWithAddressParts= !addressWithTargetLayers ? getAddressParts(query) : {};
var parsedAddress = extend(addressWithAdminParts, var parsedAddress = extend(addressWithAdminParts,
addressWithTargetLayers,
addressWithAddressParts); addressWithAddressParts);
var address_parts = [ 'name', var address_parts = [ 'name',
@ -58,8 +59,7 @@ module.exports = function(query) {
'country', 'country',
'postalcode', 'postalcode',
'regions', 'regions',
'admin_parts', 'admin_parts'
'target_layer'
]; ];
var parsed_input = {}; var parsed_input = {};

5
sanitiser/_input.js

@ -21,7 +21,10 @@ function sanitize( req ){
req.clean.input = params.input; req.clean.input = params.input;
req.clean.parsed_input = query_parser(params.input); req.clean.parsed_input = query_parser.get_parsed_address(params.input);
req.clean.types = req.clean.layers || {};
req.clean.types.from_address_parsing = query_parser.get_layers(params.input);
return { 'error': false }; return { 'error': false };
} }

17
test/unit/helper/query_parser.js

@ -6,7 +6,8 @@ module.exports.tests = {};
module.exports.tests.interface = function(test, common) { module.exports.tests.interface = function(test, common) {
test('interface', function(t) { test('interface', function(t) {
t.equal(typeof parser, 'function', 'valid function'); t.equal(typeof parser.get_parsed_address, 'function', 'valid function');
t.equal(typeof parser.get_layers, 'function', 'valid function');
t.end(); t.end();
}); });
}; };
@ -17,7 +18,7 @@ module.exports.tests.split_on_comma = function(test, common) {
var testParse = function(query) { var testParse = function(query) {
test('naive parsing ' + query, function(t) { test('naive parsing ' + query, function(t) {
var address = parser(query); var address = parser.get_parsed_address(query);
var delimIndex = query.indexOf(delim); var delimIndex = query.indexOf(delim);
var name = query.substring(0, delimIndex); var name = query.substring(0, delimIndex);
var admin_parts = query.substring(delimIndex + 1).trim(); var admin_parts = query.substring(delimIndex + 1).trim();
@ -41,11 +42,12 @@ module.exports.tests.parse_three_chars_or_less = function(test, common) {
var testParse = function(query) { var testParse = function(query) {
test('query length < 3 (' + query + ')', function(t) { test('query length < 3 (' + query + ')', function(t) {
var address = parser(query); var address = parser.get_parsed_address(query);
var target_layer = get_layers(['admin']); var target_layer = get_layers(['admin']);
var layers = parser.get_layers(query);
t.equal(typeof address, 'object', 'valid object'); t.equal(typeof address, 'object', 'valid object');
t.deepEqual(address.target_layer, target_layer, 'admin_parts set correctly to ' + target_layer.join(', ')); t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
t.end(); t.end();
}); });
}; };
@ -63,15 +65,16 @@ module.exports.tests.parse_one_or_more_tokens = function(test, common) {
var testParse = function(query, parse_address) { var testParse = function(query, parse_address) {
test('query with one or more tokens (' + query + ')', function(t) { test('query with one or more tokens (' + query + ')', function(t) {
var address = parser(query); var address = parser.get_parsed_address(query);
var target_layer = get_layers(['admin', 'poi']); var target_layer = get_layers(['admin', 'poi']);
var layers = parser.get_layers(query);
t.equal(typeof address, 'object', 'valid object'); t.equal(typeof address, 'object', 'valid object');
if (parse_address) { if (parse_address) {
t.deepEqual(address.regions.join(''), query, 'since query contained a number, it went through address parsing'); t.deepEqual(address.regions.join(''), query, 'since query contained a number, it went through address parsing');
} else { } else {
t.deepEqual(address.target_layer, target_layer, 'admin_parts set correctly to ' + target_layer.join(', ')); t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
} }
t.end(); t.end();
@ -114,7 +117,7 @@ module.exports.tests.parse_address = function(test, common) {
// remove leading whitespace // remove leading whitespace
query_string = query_string.substring(1); query_string = query_string.substring(1);
var address = parser(query_string); var address = parser.get_parsed_address(query_string);
var non_address_layer = get_layers(['admin', 'poi']); var non_address_layer = get_layers(['admin', 'poi']);
t.equal(typeof address, 'object', 'valid object for the address ('+query_string+')'); t.equal(typeof address, 'object', 'valid object for the address ('+query_string+')');

6
test/unit/query/search.js

@ -274,7 +274,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ], 'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10, size: 10,
details: true, details: true,
parsed_input: parser(address), parsed_input: parser.get_parsed_address(address),
default_layers_set: true default_layers_set: true
}); });
@ -476,7 +476,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ], 'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10, size: 10,
details: true, details: true,
parsed_input: parser(partial_address), parsed_input: parser.get_parsed_address(partial_address),
default_layers_set: true default_layers_set: true
}); });
@ -644,7 +644,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ], 'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10, size: 10,
details: true, details: true,
parsed_input: parser(partial_address), parsed_input: parser.get_parsed_address(partial_address),
default_layers_set: true default_layers_set: true
}); });

2
test/unit/sanitiser/_input.js

@ -17,7 +17,7 @@ var input = require('../../../sanitiser/_input'),
lon:0 lon:0
}, },
getTargetLayers = function(query) { getTargetLayers = function(query) {
var address = parser(query); var address = parser.get_parsed_address(query);
return address.target_layer; return address.target_layer;
}; };

7
test/unit/sanitiser/search.js

@ -11,6 +11,8 @@ var search = require('../../../sanitiser/search'),
types: { types: {
from_layers: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood', from_layers: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood',
'locality', 'local_admin', 'osmaddress', 'openaddresses' ], 'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
from_address_parsing: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood',
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
}, },
size: 10, size: 10,
details: true, details: true,
@ -79,7 +81,7 @@ module.exports.tests.sanitize_input_with_delim = function(test, common) {
var expected = JSON.parse(JSON.stringify( defaultClean )); var expected = JSON.parse(JSON.stringify( defaultClean ));
expected.input = input; expected.input = input;
expected.parsed_input = parser(input); expected.parsed_input = parser.get_parsed_address(input);
t.equal(err, undefined, 'no error'); t.equal(err, undefined, 'no error');
t.equal(clean.parsed_input.name, expected.parsed_input.name, 'clean name set correctly'); t.equal(clean.parsed_input.name, expected.parsed_input.name, 'clean name set correctly');
@ -330,7 +332,8 @@ module.exports.tests.sanitize_layers = function(test, common) {
test('address (alias) layer', function(t) { test('address (alias) layer', function(t) {
var address_layers = ['osmaddress','openaddresses']; var address_layers = ['osmaddress','openaddresses'];
sanitize({ layers: 'address', input: 'test' }, function( err, clean ){ sanitize({ layers: 'address', input: 'test' }, function( err, clean ){
t.deepEqual(clean.types.from_layers, address_layers, 'address layers set'); t.deepEqual(clean.types.from_layers, address_layers, 'types from layers set');
t.deepEqual(clean.types.from_address_parser, _input.allLayers, 'address parser uses default layers');
t.end(); t.end();
}); });
}); });

Loading…
Cancel
Save