Browse Source

Separate concerns of address parser

The address parser currently does two things:
1.) make some intelligent guesses as to possible admin regions to
explicitly search against to improve the quality of results returned
2.) make some intelligent guesses as to when no part of the query needs
to search against anything other than admin regions. This somewhat
improves the quality of results returned but mostly improves the speed
of the Elasticsearch query since it's searching significantly fewer
records.

These two concerns are now split into two separate methods within the
query_parser helper module. They are mostly independent today, but don't
have to be in the future.
pull/221/head
Julian Simioni 9 years ago
parent
commit
7a6ac8541b
  1. 44
      helper/query_parser.js
  2. 5
      sanitiser/_input.js
  3. 17
      test/unit/helper/query_parser.js
  4. 6
      test/unit/query/search.js
  5. 2
      test/unit/sanitiser/_input.js
  6. 7
      test/unit/sanitiser/search.js

44
helper/query_parser.js

@ -1,10 +1,25 @@
var parser = require('addressit');
var extend = require('extend');
var get_layers = require('../helper/layers');
var get_layers_helper = require('../helper/layers');
var delim = ',';
module.exports = function(query) {
module.exports = {};
/**
 * Choose a restricted set of layers for queries that need no address parsing.
 *
 * @param {string} query - raw input text; tokenized on runs of spaces and commas.
 * @returns {*} the value of get_layers_helper(['admin']) when the query is at
 *   most three characters long; the value of get_layers_helper(['admin', 'poi'])
 *   when the query is a single token, or fewer than three tokens with no digit;
 *   otherwise undefined, because no other branch returns a value.
 */
module.exports.get_layers = function get_layers(query) {
var tokenized = query.split(/[ ,]+/);
// true when the query contains at least one digit anywhere
var hasNumber = /\d/.test(query);
if (query.length <= 3 ) {
// no address parsing required
return get_layers_helper(['admin']);
} else if (tokenized.length === 1 || (tokenized.length < 3 && !hasNumber)) {
// no need to hit address layers if there's only one (or two) token(s)
return get_layers_helper(['admin', 'poi']);
}
// falls through without a return: callers receive undefined for longer queries
};
module.exports.get_parsed_address = function get_parsed_address(query) {
var tokenized = query.split(/[ ,]+/);
var hasNumber = /\d/.test(query);
@ -22,32 +37,18 @@ module.exports = function(query) {
return address;
};
// Build an object holding only `target_layer` for queries that match one of
// the two short-query rules below; returns null when neither rule matches.
// NOTE(review): `tokenized` and `hasNumber` are not declared in this function,
// so they must come from the enclosing scope — confirm they are computed from
// the same query before this function is called.
var getTargetLayersWhenAddressParsingIsNotNecessary = function(query) {
var address = {};
// set target_layer if input length <= 3 characters
if (query.length <= 3 ) {
// no address parsing required
address.target_layer = get_layers(['admin']);
} else if (tokenized.length === 1 || (tokenized.length < 3 && !hasNumber)) {
// no need to hit address layers if there's only one (or two) token(s)
address.target_layer = get_layers(['admin', 'poi']);
}
// null (rather than an empty object) signals that no rule applied
return address.target_layer ? address : null;
};
// Hand the query to `parser` and return whatever it produces.
// NOTE(review): as listed here, the unconditional return below makes the
// length-guarded return after it unreachable. This listing appears to
// interleave two revisions of the function — confirm which one is intended.
var getAddressParts = function(query) {
// perform full address parsing
// except on queries so short they obviously can't contain an address
return parser( query );
if (query.length > 3) {
return parser( query );
}
};
var addressWithAdminParts = getAdminPartsBySplittingOnDelim(query);
var addressWithTargetLayers= getTargetLayersWhenAddressParsingIsNotNecessary(query);
var addressWithAddressParts= !addressWithTargetLayers ? getAddressParts(query) : {};
var addressWithAddressParts= getAddressParts(query);
var parsedAddress = extend(addressWithAdminParts,
addressWithTargetLayers,
addressWithAddressParts);
var address_parts = [ 'name',
@ -58,8 +59,7 @@ module.exports = function(query) {
'country',
'postalcode',
'regions',
'admin_parts',
'target_layer'
'admin_parts'
];
var parsed_input = {};

5
sanitiser/_input.js

@ -21,7 +21,10 @@ function sanitize( req ){
req.clean.input = params.input;
req.clean.parsed_input = query_parser(params.input);
req.clean.parsed_input = query_parser.get_parsed_address(params.input);
req.clean.types = req.clean.layers || {};
req.clean.types.from_address_parsing = query_parser.get_layers(params.input);
return { 'error': false };
}

17
test/unit/helper/query_parser.js

@ -6,7 +6,8 @@ module.exports.tests = {};
module.exports.tests.interface = function(test, common) {
test('interface', function(t) {
t.equal(typeof parser, 'function', 'valid function');
t.equal(typeof parser.get_parsed_address, 'function', 'valid function');
t.equal(typeof parser.get_layers, 'function', 'valid function');
t.end();
});
};
@ -17,7 +18,7 @@ module.exports.tests.split_on_comma = function(test, common) {
var testParse = function(query) {
test('naive parsing ' + query, function(t) {
var address = parser(query);
var address = parser.get_parsed_address(query);
var delimIndex = query.indexOf(delim);
var name = query.substring(0, delimIndex);
var admin_parts = query.substring(delimIndex + 1).trim();
@ -41,11 +42,12 @@ module.exports.tests.parse_three_chars_or_less = function(test, common) {
var testParse = function(query) {
test('query length < 3 (' + query + ')', function(t) {
var address = parser(query);
var address = parser.get_parsed_address(query);
var target_layer = get_layers(['admin']);
var layers = parser.get_layers(query);
t.equal(typeof address, 'object', 'valid object');
t.deepEqual(address.target_layer, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
t.end();
});
};
@ -63,15 +65,16 @@ module.exports.tests.parse_one_or_more_tokens = function(test, common) {
var testParse = function(query, parse_address) {
test('query with one or more tokens (' + query + ')', function(t) {
var address = parser(query);
var address = parser.get_parsed_address(query);
var target_layer = get_layers(['admin', 'poi']);
var layers = parser.get_layers(query);
t.equal(typeof address, 'object', 'valid object');
if (parse_address) {
t.deepEqual(address.regions.join(''), query, 'since query contained a number, it went through address parsing');
} else {
t.deepEqual(address.target_layer, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', '));
}
t.end();
@ -114,7 +117,7 @@ module.exports.tests.parse_address = function(test, common) {
// remove leading whitespace
query_string = query_string.substring(1);
var address = parser(query_string);
var address = parser.get_parsed_address(query_string);
var non_address_layer = get_layers(['admin', 'poi']);
t.equal(typeof address, 'object', 'valid object for the address ('+query_string+')');

6
test/unit/query/search.js

@ -274,7 +274,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10,
details: true,
parsed_input: parser(address),
parsed_input: parser.get_parsed_address(address),
default_layers_set: true
});
@ -476,7 +476,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10,
details: true,
parsed_input: parser(partial_address),
parsed_input: parser.get_parsed_address(partial_address),
default_layers_set: true
});
@ -644,7 +644,7 @@ module.exports.tests.query = function(test, common) {
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
size: 10,
details: true,
parsed_input: parser(partial_address),
parsed_input: parser.get_parsed_address(partial_address),
default_layers_set: true
});

2
test/unit/sanitiser/_input.js

@ -17,7 +17,7 @@ var input = require('../../../sanitiser/_input'),
lon:0
},
getTargetLayers = function(query) {
var address = parser(query);
var address = parser.get_parsed_address(query);
return address.target_layer;
};

7
test/unit/sanitiser/search.js

@ -11,6 +11,8 @@ var search = require('../../../sanitiser/search'),
types: {
from_layers: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood',
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
from_address_parsing: [ 'geoname', 'osmnode', 'osmway', 'admin0', 'admin1', 'admin2', 'neighborhood',
'locality', 'local_admin', 'osmaddress', 'openaddresses' ],
},
size: 10,
details: true,
@ -79,7 +81,7 @@ module.exports.tests.sanitize_input_with_delim = function(test, common) {
var expected = JSON.parse(JSON.stringify( defaultClean ));
expected.input = input;
expected.parsed_input = parser(input);
expected.parsed_input = parser.get_parsed_address(input);
t.equal(err, undefined, 'no error');
t.equal(clean.parsed_input.name, expected.parsed_input.name, 'clean name set correctly');
@ -330,7 +332,8 @@ module.exports.tests.sanitize_layers = function(test, common) {
test('address (alias) layer', function(t) {
var address_layers = ['osmaddress','openaddresses'];
sanitize({ layers: 'address', input: 'test' }, function( err, clean ){
t.deepEqual(clean.types.from_layers, address_layers, 'address layers set');
t.deepEqual(clean.types.from_layers, address_layers, 'types from layers set');
t.deepEqual(clean.types.from_address_parser, _input.allLayers, 'address parser uses default layers');
t.end();
});
});

Loading…
Cancel
Save