Browse Source

Merge pull request #736 from pelias/use-libpostal-to-parse-address-field

Use libpostal to parse address field
pull/739/merge
Stephen K Hess 8 years ago committed by GitHub
parent
commit
2b0fa561eb
  1. 4
      middleware/trimByGranularityComponent.js
  2. 2
      package.json
  3. 42
      sanitizer/_synthesize_analysis.js
  4. 193
      test/unit/fixture/component_geocoding/fallback.json
  5. 24
      test/unit/middleware/trimByGranularityComponent.js
  6. 9
      test/unit/query/component_geocoding.js
  7. 124
      test/unit/sanitizer/_synthesize_analysis.js

4
middleware/trimByGranularityComponent.js

@ -22,8 +22,8 @@ const _ = require('lodash');
// supplied we want to retain borough=Manhattan and city=Manhattan results // supplied we want to retain borough=Manhattan and city=Manhattan results
const layers = [ const layers = [
'venue', 'venue',
'street',
'address', 'address',
'street',
'neighbourhood', 'neighbourhood',
['borough', 'locality'], ['borough', 'locality'],
'localadmin', 'localadmin',
@ -44,8 +44,8 @@ const layers = [
// city=Manhattan // city=Manhattan
const explicit_borough_layers = [ const explicit_borough_layers = [
'venue', 'venue',
'street',
'address', 'address',
'street',
'neighbourhood', 'neighbourhood',
'borough', 'borough',
'locality', 'locality',

2
package.json

@ -56,7 +56,7 @@
"pelias-labels": "1.5.1", "pelias-labels": "1.5.1",
"pelias-logger": "0.1.0", "pelias-logger": "0.1.0",
"pelias-model": "4.3.0", "pelias-model": "4.3.0",
"pelias-query": "8.9.0", "pelias-query": "8.10.0",
"pelias-text-analyzer": "1.6.0", "pelias-text-analyzer": "1.6.0",
"stats-lite": "2.0.3", "stats-lite": "2.0.3",
"through2": "2.0.1" "through2": "2.0.1"

42
sanitizer/_synthesize_analysis.js

@ -1,4 +1,5 @@
const _ = require('lodash'); const _ = require('lodash');
const text_analyzer = require('pelias-text-analyzer');
const fields = { const fields = {
'address': 'address', 'address': 'address',
@ -20,6 +21,19 @@ function isPostalCodeOnly(parsed_text) {
parsed_text.hasOwnProperty('postalcode'); parsed_text.hasOwnProperty('postalcode');
} }
// figure out which field contains the probable house number, prefer number
// libpostal parses some inputs, like `3370 cobbe ave`, as a postcode+street
// so because we're treating the entire field as a street address, it's safe
// to assume that an identified postcode is actually a house number.
function getHouseNumberField(analyzed_address) {
for (var field of ['number', 'postalcode']) {
if (analyzed_address.hasOwnProperty(field)) {
return field;
}
}
}
function sanitize( raw, clean ){ function sanitize( raw, clean ){
// error & warning messages // error & warning messages
@ -43,6 +57,34 @@ function sanitize( raw, clean ){
`at least one of the following fields is required: ${Object.keys(fields).join(', ')}`); `at least one of the following fields is required: ${Object.keys(fields).join(', ')}`);
} }
if (clean.parsed_text.hasOwnProperty('address')) {
const analyzed_address = text_analyzer.parse(clean.parsed_text.address);
const house_number_field = getHouseNumberField(analyzed_address);
// if we're fairly certain that libpostal identified a house number
// (from either the house_number or postcode field), place it into the
// number field and remove the first instance of that value from address
// and assign to street
// eg - '1090 N Charlotte St' becomes number=1090 and street=N Charlotte St
if (house_number_field) {
clean.parsed_text.number = analyzed_address[house_number_field];
// remove the first instance of the number and trim whitespace
clean.parsed_text.street = _.trim(_.replace(clean.parsed_text.address, clean.parsed_text.number, ''));
} else {
// otherwise no house number was identifiable, so treat the entire input
// as a street
clean.parsed_text.street = clean.parsed_text.address;
}
// the address field no longer means anything since it's been parsed, so remove it
delete clean.parsed_text.address;
}
return messages; return messages;
} }

193
test/unit/fixture/component_geocoding/fallback.json

@ -6,6 +6,197 @@
"query": { "query": {
"bool": { "bool": {
"should": [ "should": [
{
"bool": {
"_name": "fallback.address",
"must": [
{
"match_phrase": {
"address_parts.number": "number value"
}
},
{
"match_phrase": {
"address_parts.street": "street value"
}
},
{
"multi_match": {
"query": "neighbourhood value",
"type": "phrase",
"fields": [
"parent.neighbourhood",
"parent.neighbourhood_a"
]
}
},
{
"multi_match": {
"query": "borough value",
"type": "phrase",
"fields": [
"parent.borough",
"parent.borough_a"
]
}
},
{
"multi_match": {
"query": "city value",
"type": "phrase",
"fields": [
"parent.locality",
"parent.locality_a",
"parent.localadmin",
"parent.localadmin_a"
]
}
},
{
"multi_match": {
"query": "county value",
"type": "phrase",
"fields": [
"parent.county",
"parent.county_a",
"parent.macrocounty",
"parent.macrocounty_a"
]
}
},
{
"multi_match": {
"query": "state value",
"type": "phrase",
"fields": [
"parent.region",
"parent.region_a",
"parent.macroregion",
"parent.macroregion_a"
]
}
},
{
"multi_match": {
"query": "country value",
"type": "phrase",
"fields": [
"parent.country",
"parent.country_a",
"parent.dependency",
"parent.dependency_a"
]
}
}
],
"should": [
{
"match_phrase": {
"address_parts.zip": "postalcode value"
}
}
],
"filter": {
"term": {
"layer": "address"
}
},
"boost": 50
}
},
{
"bool": {
"_name": "fallback.street",
"must": [
{
"match_phrase": {
"address_parts.street": "street value"
}
},
{
"multi_match": {
"query": "neighbourhood value",
"type": "phrase",
"fields": [
"parent.neighbourhood",
"parent.neighbourhood_a"
]
}
},
{
"multi_match": {
"query": "borough value",
"type": "phrase",
"fields": [
"parent.borough",
"parent.borough_a"
]
}
},
{
"multi_match": {
"query": "city value",
"type": "phrase",
"fields": [
"parent.locality",
"parent.locality_a",
"parent.localadmin",
"parent.localadmin_a"
]
}
},
{
"multi_match": {
"query": "county value",
"type": "phrase",
"fields": [
"parent.county",
"parent.county_a",
"parent.macrocounty",
"parent.macrocounty_a"
]
}
},
{
"multi_match": {
"query": "state value",
"type": "phrase",
"fields": [
"parent.region",
"parent.region_a",
"parent.macroregion",
"parent.macroregion_a"
]
}
},
{
"multi_match": {
"query": "country value",
"type": "phrase",
"fields": [
"parent.country",
"parent.country_a",
"parent.dependency",
"parent.dependency_a"
]
}
}
],
"should": [
{
"match_phrase": {
"address_parts.zip": "postalcode value"
}
}
],
"filter": {
"term": {
"layer": "street"
}
},
"boost": 100
}
},
{ {
"bool": { "bool": {
"_name": "fallback.neighbourhood", "_name": "fallback.neighbourhood",
@ -514,4 +705,4 @@
], ],
"size": 20, "size": 20,
"track_scores": true "track_scores": true
} }

24
test/unit/middleware/trimByGranularityComponent.js

@ -19,8 +19,8 @@ module.exports.tests.trimByGranularity = function(test, common) {
data: [ data: [
{ name: 'venue 1', _matched_queries: ['fallback.venue'] }, { name: 'venue 1', _matched_queries: ['fallback.venue'] },
{ name: 'venue 2', _matched_queries: ['fallback.venue'] }, { name: 'venue 2', _matched_queries: ['fallback.venue'] },
{ name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'address 1', _matched_queries: ['fallback.address'] }, { name: 'address 1', _matched_queries: ['fallback.address'] },
{ name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] },
{ name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] },
{ name: 'locality 1', _matched_queries: ['fallback.locality'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] },
@ -55,9 +55,9 @@ module.exports.tests.trimByGranularity = function(test, common) {
var res = { var res = {
data: [ data: [
{ name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'street 2', _matched_queries: ['fallback.street'] },
{ name: 'address 1', _matched_queries: ['fallback.address'] }, { name: 'address 1', _matched_queries: ['fallback.address'] },
{ name: 'address 2', _matched_queries: ['fallback.address'] },
{ name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] },
{ name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] },
{ name: 'locality 1', _matched_queries: ['fallback.locality'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] },
@ -73,13 +73,13 @@ module.exports.tests.trimByGranularity = function(test, common) {
}; };
var expected_data = [ var expected_data = [
{ name: 'street 1', _matched_queries: ['fallback.street'] }, { name: 'address 1', _matched_queries: ['fallback.address'] },
{ name: 'street 2', _matched_queries: ['fallback.street'] } { name: 'address 2', _matched_queries: ['fallback.address'] }
]; ];
function testIt() { function testIt() {
trimByGranularity(req, res, function() { trimByGranularity(req, res, function() {
t.deepEquals(res.data, expected_data, 'only street records should be here'); t.deepEquals(res.data, expected_data, 'only address records should be here');
t.end(); t.end();
}); });
} }
@ -87,13 +87,13 @@ module.exports.tests.trimByGranularity = function(test, common) {
testIt(); testIt();
}); });
test('all records with fallback.* matched_queries name should retain only address when they are most granular', function(t) { test('all records with fallback.* matched_queries name should retain only street when they are most granular', function(t) {
var req = { clean: {} }; var req = { clean: {} };
var res = { var res = {
data: [ data: [
{ name: 'address 1', _matched_queries: ['fallback.address'] }, { name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'address 2', _matched_queries: ['fallback.address'] }, { name: 'street 2', _matched_queries: ['fallback.street'] },
{ name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] }, { name: 'neighbourhood 1', _matched_queries: ['fallback.neighbourhood'] },
{ name: 'borough 1', _matched_queries: ['fallback.borough'] }, { name: 'borough 1', _matched_queries: ['fallback.borough'] },
{ name: 'locality 1', _matched_queries: ['fallback.locality'] }, { name: 'locality 1', _matched_queries: ['fallback.locality'] },
@ -109,13 +109,13 @@ module.exports.tests.trimByGranularity = function(test, common) {
}; };
var expected_data = [ var expected_data = [
{ name: 'address 1', _matched_queries: ['fallback.address'] }, { name: 'street 1', _matched_queries: ['fallback.street'] },
{ name: 'address 2', _matched_queries: ['fallback.address'] }, { name: 'street 2', _matched_queries: ['fallback.street'] },
]; ];
function testIt() { function testIt() {
trimByGranularity(req, res, function() { trimByGranularity(req, res, function() {
t.deepEquals(res.data, expected_data, 'only address records should be here'); t.deepEquals(res.data, expected_data, 'only street records should be here');
t.end(); t.end();
}); });
} }

9
test/unit/query/component_geocoding.js

@ -14,7 +14,6 @@ module.exports.tests.query = function(test, common) {
test('valid search + focus + bbox', function(t) { test('valid search + focus + bbox', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', text: 'test',
querySize: 10, querySize: 10,
@ -39,7 +38,6 @@ module.exports.tests.query = function(test, common) {
test('valid search + bbox', function(t) { test('valid search + bbox', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', text: 'test',
querySize: 10, querySize: 10,
@ -63,7 +61,6 @@ module.exports.tests.query = function(test, common) {
test('valid lingustic-only search', function(t) { test('valid lingustic-only search', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
layers: ['test'] layers: ['test']
@ -82,7 +79,6 @@ module.exports.tests.query = function(test, common) {
test('search search + focus', function(t) { test('search search + focus', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
'focus.point.lat': 29.49136, 'focus.point.lon': -82.50622, 'focus.point.lat': 29.49136, 'focus.point.lon': -82.50622,
@ -102,7 +98,6 @@ module.exports.tests.query = function(test, common) {
test('search search + viewport', function(t) { test('search search + viewport', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
'focus.viewport.min_lat': 28.49136, 'focus.viewport.min_lat': 28.49136,
@ -127,7 +122,6 @@ module.exports.tests.query = function(test, common) {
test('search with viewport diagonal < 1km should set scale to 1km', function(t) { test('search with viewport diagonal < 1km should set scale to 1km', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
'focus.viewport.min_lat': 28.49135, 'focus.viewport.min_lat': 28.49135,
@ -150,7 +144,6 @@ module.exports.tests.query = function(test, common) {
test('search search + focus on null island', function(t) { test('search search + focus on null island', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
'focus.point.lat': 0, 'focus.point.lon': 0, 'focus.point.lat': 0, 'focus.point.lon': 0,
@ -198,7 +191,6 @@ module.exports.tests.query = function(test, common) {
test('valid boundary.country search', function(t) { test('valid boundary.country search', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
text: 'test', querySize: 10, text: 'test', querySize: 10,
layers: ['test'], layers: ['test'],
@ -218,7 +210,6 @@ module.exports.tests.query = function(test, common) {
test('valid sources filter', function(t) { test('valid sources filter', function(t) {
var clean = { var clean = {
parsed_text: { parsed_text: {
street: 'street value'
}, },
'text': 'test', 'text': 'test',
'sources': ['test_source'] 'sources': ['test_source']

124
test/unit/sanitizer/_synthesize_analysis.js

@ -1,13 +1,18 @@
const sanitizer = require('../../../sanitizer/_synthesize_analysis');
const _ = require('lodash'); const _ = require('lodash');
const proxyquire = require('proxyquire').noCallThru();
module.exports.tests = {}; module.exports.tests = {};
module.exports.tests.text_parser = function(test, common) { module.exports.tests.text_parser = function(test, common) {
test('all variables should be parsed', function(t) { test('all variables should be parsed', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.fail('parse should not have been called');
}
}});
const raw = { const raw = {
query: ' \t query \t value \t ', query: ' \t query \t value \t ',
address: ' \t address \t value \t ',
neighbourhood: ' \t neighbourhood \t value \t ', neighbourhood: ' \t neighbourhood \t value \t ',
borough: ' \t borough \t value \t ', borough: ' \t borough \t value \t ',
locality: ' \t locality \t value \t ', locality: ' \t locality \t value \t ',
@ -21,7 +26,6 @@ module.exports.tests.text_parser = function(test, common) {
const expected_clean = { const expected_clean = {
parsed_text: { parsed_text: {
address: 'address value',
neighbourhood: 'neighbourhood value', neighbourhood: 'neighbourhood value',
borough: 'borough value', borough: 'borough value',
city: 'locality value', city: 'locality value',
@ -42,6 +46,12 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('non-string and blank string values should be treated as not supplied', function(t) { test('non-string and blank string values should be treated as not supplied', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.fail('parse should not have been called');
}
}});
// helper to return a random value that's considered invalid // helper to return a random value that's considered invalid
function getInvalidValue() { function getInvalidValue() {
return _.sample([{}, [], false, '', ' \t ', 17, undefined]); return _.sample([{}, [], false, '', ' \t ', 17, undefined]);
@ -75,6 +85,12 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('no supplied fields should return error', function(t) { test('no supplied fields should return error', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.fail('parse should not have been called');
}
}});
const raw = {}; const raw = {};
const clean = {}; const clean = {};
@ -92,6 +108,12 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('postalcode-only parsed_text should return error', function(t) { test('postalcode-only parsed_text should return error', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.fail('parse should not have been called');
}
}});
const raw = { const raw = {
postalcode: 'postalcode value' postalcode: 'postalcode value'
}; };
@ -113,6 +135,102 @@ module.exports.tests.text_parser = function(test, common) {
}); });
test('text_analyzer identifying house number should extract it and street', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.equals(query, 'Number Value Street Value Number Value');
return {
number: 'Number Value'
};
}
}});
const raw = {
address: 'Number Value Street Value Number Value'
};
const clean = {};
const expected_clean = {
parsed_text: {
number: 'Number Value',
street: 'Street Value Number Value'
}
};
const messages = sanitizer(raw, clean);
t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();
});
test('text_analyzer identifying postalcode but not house number should assign to number and remove from address', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.equals(query, 'Number Value Street Value Number Value');
return {
postalcode: 'Number Value'
};
}
}});
const raw = {
address: 'Number Value Street Value Number Value'
};
const clean = {};
const expected_clean = {
parsed_text: {
number: 'Number Value',
street: 'Street Value Number Value'
}
};
const messages = sanitizer(raw, clean);
t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();
});
test('text_analyzer not revealing possible number should move address to street', function(t) {
var sanitizer = proxyquire('../../../sanitizer/_synthesize_analysis', {
'pelias-text-analyzer': { parse: function(query) {
t.equals(query, 'Street Value');
return {};
}
}});
const raw = {
address: 'Street Value'
};
const clean = {};
const expected_clean = {
parsed_text: {
street: 'Street Value'
}
};
const messages = sanitizer(raw, clean);
t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();
});
}; };
module.exports.all = function (tape, common) { module.exports.all = function (tape, common) {

Loading…
Cancel
Save