Browse Source

added addressit support for autocomplete via separate text sanitiser

this commit combines the other sanitiser/_text.js and addressit parser logic from text-analyzer into one module for easier integration until such time that libpostal is ready for autocomplete
pull/666/head
Stephen Hess 9 years ago
parent
commit
b612b2750e
  1. 1
      package.json
  2. 107
      sanitiser/_text_autocomplete.js
  3. 276
      test/unit/sanitiser/_text_autocomplete.js

1
package.json

@ -35,6 +35,7 @@
"node": ">=0.10.26"
},
"dependencies": {
"addressit": "git://github.com/dianashk/addressit.git#temp",
"async": "^2.0.0",
"check-types": "^7.0.0",
"elasticsearch": "^11.0.0",

107
sanitiser/_text_autocomplete.js

@ -0,0 +1,107 @@
var check = require('check-types');
var parser = require('addressit');
var extend = require('extend');
var _ = require('lodash');
var logger = require('pelias-logger').get('api');
/**
 * Sanitiser for the 'text' parameter.
 *
 * When raw.text is a non-empty string it is copied onto clean.text and run
 * through parse(); the parser output is stored on clean.parsed_text unless it
 * is null or undefined. Otherwise a single error message is recorded and
 * clean is left untouched.
 *
 * @param {Object} raw   - unsanitised request parameters (reads raw.text)
 * @param {Object} clean - sanitised parameters, mutated in place
 * @returns {{errors: string[], warnings: string[]}} messages collected here
 */
function sanitize( raw, clean ){
  var messages = { errors: [], warnings: [] };

  // guard clause: anything but a non-empty string is rejected outright
  if( !check.nonEmptyString( raw.text ) ){
    messages.errors.push('invalid param \'text\': text length, must be >0');
    return messages;
  }

  // the text itself is always kept, whether or not parsing yields anything
  clean.text = raw.text;

  // only attach the parser output when there is some
  var parserOutput = parse(clean.text);
  if (check.assigned(parserOutput)) {
    clean.parsed_text = parserOutput;
  }

  return messages;
}
// expose the sanitize function as this module's export
module.exports = sanitize;
// this is the addressit functionality from https://github.com/pelias/text-analyzer/blob/master/src/addressItParser.js
var DELIM = ',';
function parse(query) {
var getAdminPartsBySplittingOnDelim = function(queryParts) {
// naive approach - for admin matching during query time
// split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
var address = {};
if (queryParts.length > 1) {
address.name = queryParts[0].trim();
// 1. slice away all parts after the first one
// 2. trim spaces from each part just in case
// 3. join the parts back together with appropriate delimiter and spacing
address.admin_parts = queryParts.slice(1)
.map(function (part) { return part.trim(); })
.join(DELIM + ' ');
}
return address;
};
var getAddressParts = function(query) {
// perform full address parsing
// except on queries so short they obviously can't contain an address
if (query.length > 3) {
return parser( query );
}
};
var queryParts = query.split(DELIM);
var addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts);
var addressWithAddressParts= getAddressParts(queryParts.join(DELIM + ' '));
var parsedAddress = extend(addressWithAdminParts,
addressWithAddressParts);
var address_parts = [ 'name',
'number',
'street',
'city',
'state',
'country',
'postalcode',
'regions',
'admin_parts'
];
var parsed_text = {};
address_parts.forEach(function(part){
if (parsedAddress[part]) {
parsed_text[part] = parsedAddress[part];
}
});
// if all we found was regions, ignore it as it is not enough information to make smarter decisions
if (Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions))
{
logger.info('Ignoring address parser output, regions only');
return null;
}
return parsed_text;
}

276
test/unit/sanitiser/_text_autocomplete.js

@ -0,0 +1,276 @@
var sanitiser = require('../../../sanitiser/_text_autocomplete');
var type_mapping = require('../../../helper/type_mapping');
module.exports.tests = {};
/**
 * Test group for the autocomplete text sanitiser.
 *
 * Every case runs the sanitiser on a raw `text` value and compares both the
 * returned messages and the mutated `clean` object against expectations.
 *
 * @param {Function} test   - test registration function: test(name, callback)
 * @param {Object}   common - shared test helpers (not used by this group)
 */
module.exports.tests.text_parser = function(test, common) {

  // Registers one test that sanitises `text`, expects no errors or warnings,
  // and expects `clean` to hold the text plus `parsed_text` when one is given.
  function expectClean(name, text, parsed_text) {
    test(name, function(t) {
      var raw = { text: text };
      var clean = {};

      var expected_clean = { text: text };
      if (parsed_text !== undefined) {
        expected_clean.parsed_text = parsed_text;
      }

      var messages = sanitiser(raw, clean);

      t.deepEqual(messages, { errors: [], warnings: [] } );
      t.deepEqual(clean, expected_clean);
      t.end();
    });
  }

  // only checks that a 3-character text produces no messages; the contents of
  // `clean` are not asserted here
  test('short input text has admin layers set ', function(t) {
    var raw = {
      text: 'emp'  //start of empire state building
    };
    var clean = {
    };

    var messages = sanitiser(raw, clean);

    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  var queries = [
    { name: 'soho', admin_parts: 'new york' },
    { name: 'chelsea', admin_parts: 'london' },
    { name: '123 main', admin_parts: 'new york' }
  ];

  queries.forEach(function (query) {
    // build the title from the query text; concatenating the object itself
    // would name every test 'naive parsing [object Object]'
    var label = query.name + ', ' + query.admin_parts;

    var expected_parsed_text = function () {
      return {
        name: query.name,
        regions: [ query.name, query.admin_parts ],
        admin_parts: query.admin_parts
      };
    };

    expectClean('naive parsing ' + label,
      query.name + ', ' + query.admin_parts,
      expected_parsed_text());

    expectClean('naive parsing ' + label + ' without spaces',
      query.name + ',' + query.admin_parts,
      expected_parsed_text());
  });

  // inputs for which no parsed_text is expected on `clean`
  expectClean('query with one token', 'yugolsavia');
  expectClean('query with two tokens, no numbers', 'small town');
  expectClean('query with two tokens, number first', '123 main');
  expectClean('query with two tokens, number second', 'main 123');
  expectClean('query with many tokens', 'main particle new york');

  // inputs for which address components are expected
  expectClean('valid address, house number', '123 main st new york ny', {
    number: '123',
    street: 'main st',
    state: 'NY',
    regions: [ 'new york' ]
  });

  expectClean('valid address, zipcode', '123 main st new york ny 10010', {
    number: '123',
    street: 'main st',
    state: 'NY',
    postalcode: '10010',
    regions: [ 'new york' ]
  });

  expectClean('valid address with leading 0s in zipcode', '339 W Main St, Cheshire, 06410', {
    name: '339 W Main St',
    number: '339',
    street: 'W Main St',
    postalcode: '06410',
    regions: [ 'Cheshire' ],
    admin_parts: 'Cheshire, 06410'
  });

  expectClean('valid address without spaces after commas', '339 W Main St,Lancaster,PA', {
    name: '339 W Main St',
    number: '339',
    street: 'W Main St',
    state: 'PA',
    regions: [ 'Lancaster' ],
    admin_parts: 'Lancaster, PA'
  });
};
/**
 * Runs every test group registered on module.exports.tests.
 *
 * @param {Function} tape   - test function, called as tape(name, callback)
 * @param {Object}   common - shared helpers handed through to each group
 */
module.exports.all = function (tape, common) {
  var groups = module.exports.tests;

  // prefix each test name with the suite label
  // NOTE(review): the label reads '_text' although this file exercises
  // _text_autocomplete - confirm whether that is intended
  var labelledTest = function (name, testFunction) {
    return tape('SANITISER _text: ' + name, testFunction);
  };

  for( var groupName in groups ){
    groups[groupName](labelledTest, common);
  }
};
Loading…
Cancel
Save