Browse Source

Merge branch 'master' of github.com:pelias/api into missinglink

pull/529/head
missinglink 9 years ago
parent
commit
3051885119
  1. 27
      helper/labelGenerator.js
  2. 86
      helper/text_parser.js
  3. 3
      helper/type_mapping.js
  4. 2
      middleware/options.js
  5. 6
      package.json
  6. 4
      query/search_defaults.js
  7. 11
      query/text_parser.js
  8. 4
      sanitiser/_text.js
  9. 1
      test/ciao/autocomplete/layers_alias_coarse.coffee
  10. 2
      test/ciao/autocomplete/layers_invalid.coffee
  11. 2
      test/ciao/autocomplete/layers_mix_invalid_valid.coffee
  12. 1
      test/ciao/reverse/layers_alias_coarse.coffee
  13. 2
      test/ciao/reverse/layers_invalid.coffee
  14. 2
      test/ciao/reverse/layers_mix_invalid_valid.coffee
  15. 1
      test/ciao/search/layers_alias_coarse.coffee
  16. 2
      test/ciao/search/layers_invalid.coffee
  17. 2
      test/ciao/search/layers_mix_invalid_valid.coffee
  18. 11
      test/unit/helper/labelGenerator_examples.js
  19. 150
      test/unit/helper/text_parser.js
  20. 2
      test/unit/helper/type_mapping.js
  21. 1
      test/unit/query/autocomplete.js
  22. 8
      test/unit/query/search.js
  23. 1
      test/unit/run.js
  24. 10
      test/unit/sanitiser/_layers.js
  25. 4
      test/unit/sanitiser/search.js

27
helper/labelGenerator.js

@ -16,18 +16,27 @@ module.exports = function( record ){
// retain only things that are truthy
labelParts = _.compact(labelParts);
// first, dedupe the name and 1st label array elements
// this is used to ensure that the `name` and first admin hierarchy elements aren't repeated
// eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]`
var dedupedNameAndFirstLabelElement = _.uniq([labelParts.shift(), labelParts.shift()]);
// third, dedupe and join with a comma and return
return dedupeNameAndFirstLabelElement(labelParts).join(', ');
// second, unshift the deduped parts back onto the labelParts
labelParts.unshift.apply(labelParts, dedupedNameAndFirstLabelElement);
};
// third, join with a comma and return
return labelParts.join(', ');
function dedupeNameAndFirstLabelElement(labelParts) {
// only dedupe if a result has more than a name (the first label part)
if (labelParts.length > 1) {
// first, dedupe the name and 1st label array elements
// this is used to ensure that the `name` and first admin hierarchy elements aren't repeated
// eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]`
var deduped = _.uniq([labelParts.shift(), labelParts.shift()]);
};
// second, unshift the deduped parts back onto the labelParts
labelParts.unshift.apply(labelParts, deduped);
}
return labelParts;
}
function getSchema(country_a) {
if (country_a && country_a.length && schemas[country_a]) {

86
helper/text_parser.js

@ -1,86 +0,0 @@
var parser = require('addressit');
var extend = require('extend');
var type_mapping = require('../helper/type_mapping');
var check = require('check-types');
var logger = require('pelias-logger').get('api');
var DELIM = ',';
/*
 * Layer preference for very short inputs.
 *
 * For performance, and to prefer POI and admin records, text of three
 * characters or fewer is restricted to the coarse layers. Longer text gets
 * no preference at all: the function then returns undefined.
 */
module.exports.get_layers = function get_layers(query) {
  var isVeryShort = query.length <= 3;

  if (!isVeryShort) {
    // no layer preference for longer inputs
    return undefined;
  }

  // no address parsing required
  return type_mapping.layer_mapping.coarse;
};
/**
 * Break free-form query text into address components.
 *
 * Two strategies are combined:
 *  - a naive split on DELIM that yields `name` and `admin_parts`
 *  - the address parser required at the top of this file, which is only
 *    run when the re-joined text is longer than three characters
 *
 * Fields from the address parser override the naive ones when both are
 * present. Only truthy values of the known fields are copied to the result.
 *
 * @param {string} query raw text to parse
 * @returns {Object|null} the parsed fields, or null when `regions` was the
 *   only field found
 */
module.exports.get_parsed_address = function get_parsed_address(query) {
  // the only fields that are ever copied into the returned object
  var outputFields = [
    'name',
    'number',
    'street',
    'city',
    'state',
    'country',
    'postalcode',
    'regions',
    'admin_parts'
  ];

  // naive approach - for admin matching during query time
  // 'flatiron, new york, ny' becomes name 'flatiron' + admin_parts 'new york, ny'
  function splitNameAndAdmin(tokens) {
    var split = {};

    if (tokens.length <= 1) {
      // nothing after the first delimiter, so nothing to split
      return split;
    }

    split.name = tokens[0].trim();

    // everything after the first token, trimmed and re-joined with
    // consistent delimiter spacing
    var trimmedRemainder = tokens.slice(1).map(function (token) {
      return token.trim();
    });
    split.admin_parts = trimmedRemainder.join(DELIM + ' ');

    return split;
  }

  // full address parsing, skipped for text so short it obviously
  // can't contain an address
  function runAddressParser(text) {
    return text.length > 3 ? parser(text) : undefined;
  }

  var tokens = query.split(DELIM);
  var merged = extend(
    splitNameAndAdmin(tokens),
    runAddressParser(tokens.join(DELIM + ' '))
  );

  var parsed_text = {};
  for (var i = 0; i < outputFields.length; i++) {
    var field = outputFields[i];
    if (merged[field]) {
      parsed_text[field] = merged[field];
    }
  }

  // if all we found was regions, ignore it as it is not enough information
  // to make smarter decisions
  var foundFields = Object.keys(parsed_text);
  if (foundFields.length === 1 && !check.undefined(parsed_text.regions)) {
    logger.info('Ignoring address parser output, regions only');
    return null;
  }

  return parsed_text;
};

3
helper/type_mapping.js

@ -49,7 +49,8 @@ var LAYERS_BY_SOURCE = {
openaddresses: [ 'address' ],
geonames: [ 'country', 'region', 'county', 'locality', 'venue' ],
whosonfirst: [ 'continent', 'country', 'dependency', 'macroregion', 'region',
'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood', 'disputed']
'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'borough',
'neighbourhood', 'microhood', 'disputed']
};
/*

2
middleware/options.js

@ -9,7 +9,7 @@
function middleware(req, res, next){
if( req.method === 'OPTIONS' ){
res.send(200);
res.sendStatus(200);
} else {
next();
}

6
package.json

@ -35,7 +35,6 @@
"elasticsearch": ">=1.2.1"
},
"dependencies": {
"addressit": "git://github.com/dianashk/addressit.git#temp",
"async": "^1.5.2",
"check-types": "^6.0.0",
"cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve",
@ -53,9 +52,10 @@
"morgan": "1.7.0",
"pelias-config": "^1.0.1",
"pelias-logger": "^0.0.8",
"pelias-model": "^3.1.0",
"pelias-query": "6.2.0",
"pelias-model": "^4.0.0",
"pelias-query": "6.3.0",
"pelias-suggester-pipeline": "2.0.4",
"pelias-text-analyzer": "^1.0.1",
"stats-lite": "1.0.3",
"through2": "2.0.1"
},

4
query/search_defaults.js

@ -78,6 +78,10 @@ module.exports = _.merge({}, peliasQuery.defaults, {
'admin:locality:field': 'parent.locality',
'admin:locality:boost': 1,
'admin:borough:analyzer': 'peliasAdmin',
'admin:borough:field': 'parent.borough',
'admin:borough:boost': 1,
'admin:neighbourhood:analyzer': 'peliasAdmin',
'admin:neighbourhood:field': 'parent.neighbourhood',
'admin:neighbourhood:boost': 1,

11
query/text_parser.js

@ -1,20 +1,15 @@
var logger = require('pelias-logger').get('api');
var placeTypes = require('../helper/placeTypes');
/*
This list should only contain admin fields we are comfortable matching in the case
when we can't identify parts of an address. This shouldn't contain fields like country_a
or postalcode because we should only try to match those when we're sure that's what they are.
*/
var adminFields = [
'country',
'region',
var adminFields = placeTypes.concat([
'region_a',
'county',
'localadmin',
'locality',
'neighbourhood'
];
]);
/**
@todo: refactor me

4
sanitiser/_text.js

@ -1,5 +1,5 @@
var check = require('check-types'),
text_parser = require('../helper/text_parser');
text_analyzer = require('pelias-text-analyzer');
// validate texts, convert types and apply defaults
function sanitize( raw, clean ){
@ -19,7 +19,7 @@ function sanitize( raw, clean ){
clean.text = raw.text;
// parse text with query parser
var parsed_text = text_parser.get_parsed_address(clean.text);
var parsed_text = text_analyzer.parse(clean.text);
if (check.assigned(parsed_text)) {
clean.parsed_text = parsed_text;
}

1
test/ciao/autocomplete/layers_alias_coarse.coffee

@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent",
"macrocounty",
"county",
"macrohood",
"borough",
"neighbourhood",
"microhood",
"disputed"

2
test/ciao/autocomplete/layers_invalid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

2
test/ciao/autocomplete/layers_mix_invalid_valid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

1
test/ciao/reverse/layers_alias_coarse.coffee

@ -40,6 +40,7 @@ json.geocoding.query.layers.should.eql [ "continent",
"macrocounty",
"county",
"macrohood",
"borough",
"neighbourhood",
"microhood",
"disputed"

2
test/ciao/reverse/layers_invalid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

2
test/ciao/reverse/layers_mix_invalid_valid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

1
test/ciao/search/layers_alias_coarse.coffee

@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent",
"macrocounty",
"county",
"macrohood",
"borough",
"neighbourhood",
"microhood",
"disputed"

2
test/ciao/search/layers_invalid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

2
test/ciao/search/layers_mix_invalid_valid.coffee

@ -24,7 +24,7 @@ json.features.should.be.instanceof Array
#? expected errors
should.exist json.geocoding.errors
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ]
json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ]
#? expected warnings
should.not.exist json.geocoding.warnings

11
test/unit/helper/labelGenerator_examples.js

@ -104,6 +104,17 @@ module.exports.tests.france = function(test, common) {
};
// A record carrying nothing but a name must produce that name verbatim as its label.
module.exports.tests.name_only = function(test, common) {
  test('name-only results (no admin fields) should not include extraneous comma', function(t) {
    var nameOnlyRecord = { name: 'Result name' };
    var label = generator(nameOnlyRecord);

    t.equal(label, 'Result name');
    t.end();
  });
};
module.exports.all = function (tape, common) {
function test(name, testFunction) {

150
test/unit/helper/text_parser.js

@ -1,150 +0,0 @@
var parser = require('../../../helper/text_parser');
var type_mapping = require('../../../helper/type_mapping');
var layers_map = type_mapping.layer_mapping;
module.exports.tests = {};
// The parser module must expose both of its public functions.
module.exports.tests.interface = function(test, common) {
  test('interface', function(t) {
    var expectedFunctions = ['get_parsed_address', 'get_layers'];

    expectedFunctions.forEach(function (functionName) {
      t.equal(typeof parser[functionName], 'function', 'valid function');
    });

    t.end();
  });
};
// Text containing a comma is split naively into `name` (before the first
// comma) and `admin_parts` (everything after it), whether or not the comma
// is followed by a space.
module.exports.tests.split_on_comma = function(test, common) {
  var queries = [
    { name: 'soho', admin_parts: 'new york' },
    { name: 'chelsea', admin_parts: 'london' },
    { name: '123 main', admin_parts: 'new york' }
  ];

  // each query is exercised once per delimiter spelling
  var variants = [
    { separator: ', ', suffix: '' },
    { separator: ',', suffix: ' without spaces' }
  ];

  queries.forEach(function (query) {
    variants.forEach(function (variant) {
      var text = query.name + variant.separator + query.admin_parts;

      // build the test name from the query text; concatenating the query
      // object itself would render as "[object Object]"
      test('naive parsing ' + text + variant.suffix, function(t) {
        var address = parser.get_parsed_address(text);

        t.equal(typeof address, 'object', 'valid object');
        t.equal(address.name, query.name, 'name set correctly to ' + address.name);
        t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts);
        t.end();
      });
    });
  });
};
// Inputs of three characters or fewer skip address parsing and are
// restricted to the coarse layers.
module.exports.tests.parse_three_chars_or_less = function(test, common) {
  var chars_queries = ['a', 'bb', 'ccc'];
  var num_queries = ['1', '12', '123'];
  var alphanum_q = ['a1', '1a2', '12c'];

  var queries = chars_queries.concat(num_queries).concat(alphanum_q);

  queries.forEach(function(query) {
    // the threshold is inclusive: three-character inputs are covered too
    test('query length <= 3 (' + query + ')', function(t) {
      var address = parser.get_parsed_address(query);
      var target_layer = layers_map.coarse;
      var layers = parser.get_layers(query);

      t.equal(typeof address, 'object', 'valid object');
      t.deepEqual(layers, target_layer, 'layers set correctly to ' + target_layer.join(', '));
      t.end();
    });
  });
};
// Free text that contains no comma and nothing address-like must yield null.
module.exports.tests.parse_one_token = function(test, common) {
  var cases = [
    { label: 'query with one token', text: 'yugolsavia' },
    { label: 'query with two tokens, no numbers', text: 'small town' },
    { label: 'query with two tokens, number first', text: '123 main' },
    { label: 'query with two tokens, number second', text: 'main 123' },
    { label: 'query with many tokens', text: 'main particle new york' }
  ];

  cases.forEach(function (testCase) {
    test(testCase.label, function (t) {
      var address = parser.get_parsed_address(testCase.text);

      t.equal(address, null, 'nothing address specific detected');
      t.end();
    });
  });
};
// Full street addresses are broken down into number, street, regions,
// state and postalcode by the address parser.
module.exports.tests.parse_address = function(test, common) {
  test('valid address, house number', function(t) {
    var query_string = '123 main st new york ny';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '123', 'parsed house number');
    t.equal(address.street, 'main st', 'parsed street');
    t.deepEqual(address.regions, ['new york'], 'parsed city');
    t.equal(address.state , 'NY', 'parsed state');
    t.end();
  });

  test('valid address, zipcode', function(t) {
    var query_string = '123 main st new york ny 10010';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '123', 'parsed house number');
    t.equal(address.street, 'main st', 'parsed street');
    t.deepEqual(address.regions, ['new york'], 'parsed city');
    t.equal(address.state , 'NY', 'parsed state');
    t.equal(address.postalcode, '10010', 'parsed zip is a string');
    t.end();
  });

  test('valid address with leading 0s in zipcode', function(t) {
    var query_string = '339 W Main St, Cheshire, 06410';
    var address = parser.get_parsed_address(query_string);

    // the zip must survive as a string, otherwise the leading 0 is lost
    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.street, 'W Main St', 'parsed street');
    t.deepEqual(address.regions, ['Cheshire'], 'parsed city');
    t.equal(address.postalcode, '06410', 'parsed zip');
    t.end();
  });

  test('valid address without spaces after commas', function(t) {
    var query_string = '339 W Main St,Lancaster,PA';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '339', 'parsed house number');
    t.equal(address.street, 'W Main St', 'parsed street');
    t.deepEqual(address.regions, ['Lancaster'], 'parsed city');
    t.equal(address.state, 'PA', 'parsed state');
    t.end();
  });
};
// Register every test group in this file with the supplied tape instance,
// prefixing each test name so the suite is identifiable in the output.
module.exports.all = function (tape, common) {
  var suites = module.exports.tests;

  var prefixedTest = function (name, testFunction) {
    return tape('QUERY PARSING: ' + name, testFunction);
  };

  for( var suiteName in suites ){
    suites[suiteName](prefixedTest, common);
  }
};

2
test/unit/helper/type_mapping.js

@ -14,7 +14,7 @@ module.exports.tests.interfaces = function(test, common) {
t.deepEquals(type_mapping.layer_mapping.coarse,
[ 'continent', 'country', 'dependency', 'macroregion',
'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood',
'neighbourhood', 'microhood', 'disputed' ]);
'borough', 'neighbourhood', 'microhood', 'disputed' ]);
t.end();
});

1
test/unit/query/autocomplete.js

@ -1,6 +1,5 @@
var generate = require('../../../query/autocomplete');
var parser = require('../../../helper/text_parser');
module.exports.tests = {};

8
test/unit/query/search.js

@ -1,5 +1,5 @@
var generate = require('../../../query/search');
var parser = require('../../../helper/text_parser');
var text_analyzer = require('pelias-text-analyzer');
module.exports.tests = {};
@ -128,7 +128,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(address),
parsed_text: text_analyzer.parse(address),
});
var compiled = JSON.parse( JSON.stringify( query ) );
@ -143,7 +143,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: partial_address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(partial_address),
parsed_text: text_analyzer.parse(partial_address),
});
var compiled = JSON.parse( JSON.stringify( query ) );
@ -158,7 +158,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: partial_address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(partial_address),
parsed_text: text_analyzer.parse(partial_address),
});
var compiled = JSON.parse( JSON.stringify( query ) );

1
test/unit/run.js

@ -19,7 +19,6 @@ var tests = [
require('./helper/labelGenerator_GBR'),
require('./helper/labelGenerator_USA'),
require('./helper/labelSchema'),
require('./helper/text_parser'),
require('./helper/type_mapping'),
require('./helper/sizeCalculator'),
require('./middleware/confidenceScore'),

10
test/unit/sanitiser/_layers.js

@ -42,8 +42,8 @@ module.exports.tests.sanitize_layers = function(test, common) {
sanitize(raw, clean);
var admin_layers = [ 'continent', 'country', 'dependency',
'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood',
'microhood', 'disputed' ];
'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county',
'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ];
t.deepEqual(clean.layers, admin_layers, 'coarse layers set');
t.end();
@ -77,8 +77,8 @@ module.exports.tests.sanitize_layers = function(test, common) {
sanitize(raw, clean);
var expected_layers = [ 'continent', 'country', 'dependency',
'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood',
'microhood', 'disputed' ];
'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county',
'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ];
t.deepEqual(clean.layers, expected_layers, 'coarse + regular layers set');
t.end();
@ -114,7 +114,7 @@ module.exports.tests.sanitize_layers = function(test, common) {
var coarse_layers = [ 'continent',
'country', 'dependency', 'macroregion', 'region', 'locality', 'localadmin',
'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood',
'macrocounty', 'county', 'macrohood', 'borough', 'neighbourhood', 'microhood',
'disputed' ];
var venue_layers = [ 'venue' ];

4
test/unit/sanitiser/search.js

@ -1,6 +1,6 @@
var extend = require('extend'),
search = require('../../../sanitiser/search'),
parser = require('../../../helper/text_parser'),
text_analyzer = require('pelias-text-analyzer'),
sanitize = search.sanitize,
middleware = search.middleware,
defaultError = 'invalid param \'text\': text length, must be >0';
@ -80,7 +80,7 @@ module.exports.tests.sanitize_text_with_delim = function(test, common) {
sanitize( req, function( ){
var expected_text = text;
var expected_parsed_text = parser.get_parsed_address(text);
var expected_parsed_text = text_analyzer.parse(text);
t.equal(req.errors[0], undefined, 'no error');
t.equal(req.clean.parsed_text.name, expected_parsed_text.name, 'clean name set correctly');
t.equal(req.clean.text, expected_text, 'text should match');

Loading…
Cancel
Save