diff --git a/package.json b/package.json
index dc099a12..2c4b940f 100644
--- a/package.json
+++ b/package.json
@@ -35,6 +35,7 @@
     "node": ">=0.10.26"
   },
   "dependencies": {
+    "addressit": "git://github.com/dianashk/addressit.git#temp",
     "async": "^2.0.0",
     "check-types": "^7.0.0",
     "elasticsearch": "^11.0.0",
diff --git a/sanitiser/_text_autocomplete.js b/sanitiser/_text_autocomplete.js
new file mode 100644
index 00000000..5578c4b1
--- /dev/null
+++ b/sanitiser/_text_autocomplete.js
@@ -0,0 +1,125 @@
+var check = require('check-types');
+var parser = require('addressit');
+var extend = require('extend');
+var _ = require('lodash');
+var logger = require('pelias-logger').get('api');
+
+// delimiter separating the name from the admin parts in the input text
+var DELIM = ',';
+
+// fields copied from the merged parser output into parsed_text, when truthy
+var ADDRESS_PARTS = [
+  'name',
+  'number',
+  'street',
+  'city',
+  'state',
+  'country',
+  'postalcode',
+  'regions',
+  'admin_parts'
+];
+
+/**
+ * Validate 'text', copy it onto clean and attach the parsed address parts.
+ *
+ * @param {object} raw - raw request parameters; only raw.text is read
+ * @param {object} clean - sanitised parameters; text and parsed_text are set here
+ * @returns {object} messages, shaped as { errors: [], warnings: [] }
+ */
+function sanitize( raw, clean ){
+
+  // error & warning messages
+  var messages = { errors: [], warnings: [] };
+
+  // invalid input 'text'
+  if( !check.nonEmptyString( raw.text ) ){
+    messages.errors.push('invalid param \'text\': text length, must be >0');
+    return messages;
+  }
+
+  // valid text
+  clean.text = raw.text;
+
+  // parse text with query parser; parse() returns null when only regions were found
+  var parsed_text = parse(clean.text);
+  if (check.assigned(parsed_text)) {
+    clean.parsed_text = parsed_text;
+  }
+
+  return messages;
+}
+
+// export function
+module.exports = sanitize;
+
+// this is the addressit functionality from https://github.com/pelias/text-analyzer/blob/master/src/addressItParser.js
+
+/**
+ * Naive approach - for admin matching during query time.
+ * Splits 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'.
+ *
+ * @param {string[]} queryParts - the input text split on DELIM
+ * @returns {object} name and admin_parts, or an empty object for a single part
+ */
+function getAdminPartsBySplittingOnDelim(queryParts) {
+  var address = {};
+
+  if (queryParts.length > 1) {
+    address.name = queryParts[0].trim();
+
+    // 1. keep all parts after the first one
+    // 2. trim spaces from each part just in case
+    // 3. join the parts back together with appropriate delimiter and spacing
+    address.admin_parts = queryParts.slice(1)
+      .map(function (part) { return part.trim(); })
+      .join(DELIM + ' ');
+  }
+
+  return address;
+}
+
+/**
+ * Perform full address parsing, except on queries so short they obviously
+ * can't contain an address.
+ *
+ * @param {string} query - the input text, re-joined with DELIM + ' '
+ * @returns {object|undefined} parser output, or undefined for short queries
+ */
+function getAddressParts(query) {
+  if (query.length > 3) {
+    return parser( query );
+  }
+}
+
+/**
+ * Parse free text into address parts.
+ *
+ * @param {string} query - input text
+ * @returns {object|null} the parts found, or null when only regions were found
+ */
+function parse(query) {
+  var queryParts = query.split(DELIM);
+
+  var addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts);
+  var addressWithAddressParts = getAddressParts(queryParts.join(DELIM + ' '));
+
+  // parser output takes precedence over the naive split on shared keys
+  var parsedAddress = extend(addressWithAdminParts, addressWithAddressParts);
+
+  var parsed_text = {};
+
+  ADDRESS_PARTS.forEach(function(part){
+    if (parsedAddress[part]) {
+      parsed_text[part] = parsedAddress[part];
+    }
+  });
+
+  // if all we found was regions, ignore it as it is not enough information to make smarter decisions
+  if (Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions)) {
+    logger.info('Ignoring address parser output, regions only');
+    return null;
+  }
+
+  return parsed_text;
+}
diff --git a/test/unit/sanitiser/_text_autocomplete.js b/test/unit/sanitiser/_text_autocomplete.js
new file mode 100644
index 00000000..a8911b69
--- /dev/null
+++ b/test/unit/sanitiser/_text_autocomplete.js
@@ -0,0 +1,279 @@
+var sanitiser = require('../../../sanitiser/_text_autocomplete');
+var type_mapping = require('../../../helper/type_mapping');
+
+module.exports.tests = {};
+
+// exercises the sanitiser against the real addressit parser (nothing is mocked)
+module.exports.tests.text_parser = function(test, common) {
+  test('short input text produces no errors or warnings', function(t) {
+    var raw = {
+      text: 'emp' //start of empire state building
+    };
+    var clean = {
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+
+    t.end();
+  });
+
+  // each entry is joined as 'name, admin_parts' (and 'name,admin_parts') to build the input text
+  var queries = [
+    { name: 'soho', admin_parts: 'new york' },
+    { name: 'chelsea', admin_parts: 'london' },
+    { name: '123 main', admin_parts: 'new york' }
+  ];
+
+  queries.forEach(function (query) {
+    test('naive parsing ' + query.name + ', ' + query.admin_parts, function(t) {
+      var raw = {
+        text: query.name + ', ' + query.admin_parts
+      };
+      var clean = {};
+
+      var expected_clean = {
+        text: query.name + ', ' + query.admin_parts,
+        parsed_text: {
+          name: query.name,
+          regions: [ query.name, query.admin_parts ],
+          admin_parts: query.admin_parts
+        }
+      };
+
+      var messages = sanitiser(raw, clean);
+
+      t.deepEqual(messages, { errors: [], warnings: [] } );
+      t.deepEqual(clean, expected_clean);
+      t.end();
+
+    });
+
+    test('naive parsing ' + query.name + ',' + query.admin_parts + ' without spaces', function(t) {
+      var raw = {
+        text: query.name + ',' + query.admin_parts
+      };
+      var clean = {};
+
+      var expected_clean = {
+        text: query.name + ',' + query.admin_parts,
+        parsed_text: {
+          name: query.name,
+          regions: [ query.name, query.admin_parts ],
+          admin_parts: query.admin_parts
+        }
+      };
+
+      var messages = sanitiser(raw, clean);
+
+      t.deepEqual(messages, { errors: [], warnings: [] } );
+      t.deepEqual(clean, expected_clean);
+      t.end();
+
+    });
+
+  });
+
+  test('query with one token', function (t) {
+    var raw = {
+      text: 'yugolsavia'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: 'yugolsavia'
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('query with two tokens, no numbers', function (t) {
+    var raw = {
+      text: 'small town'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: 'small town'
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('query with two tokens, number first', function (t) {
+    var raw = {
+      text: '123 main'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: '123 main'
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('query with two tokens, number second', function (t) {
+    var raw = {
+      text: 'main 123'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: 'main 123'
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('query with many tokens', function(t) {
+    var raw = {
+      text: 'main particle new york'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: 'main particle new york'
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('valid address, house number', function(t) {
+    var raw = {
+      text: '123 main st new york ny'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: '123 main st new york ny',
+      parsed_text: {
+        number: '123',
+        street: 'main st',
+        state: 'NY',
+        regions: [ 'new york' ]
+      }
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+  test('valid address, zipcode', function(t) {
+    var raw = {
+      text: '123 main st new york ny 10010'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: '123 main st new york ny 10010',
+      parsed_text: {
+        number: '123',
+        street: 'main st',
+        state: 'NY',
+        postalcode: '10010',
+        regions: [ 'new york' ]
+      }
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+  });
+
+  test('valid address with leading 0s in zipcode', function(t) {
+    var raw = {
+      text: '339 W Main St, Cheshire, 06410'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: '339 W Main St, Cheshire, 06410',
+      parsed_text: {
+        name: '339 W Main St',
+        number: '339',
+        street: 'W Main St',
+        postalcode: '06410',
+        regions: [ 'Cheshire' ],
+        admin_parts: 'Cheshire, 06410'
+      }
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+  });
+
+  test('valid address without spaces after commas', function(t) {
+    var raw = {
+      text: '339 W Main St,Lancaster,PA'
+    };
+    var clean = {};
+
+    var expected_clean = {
+      text: '339 W Main St,Lancaster,PA',
+      parsed_text: {
+        name: '339 W Main St',
+        number: '339',
+        street: 'W Main St',
+        state: 'PA',
+        regions: [ 'Lancaster' ],
+        admin_parts: 'Lancaster, PA'
+      }
+    };
+
+    var messages = sanitiser(raw, clean);
+
+    t.deepEqual(messages, { errors: [], warnings: [] } );
+    t.deepEqual(clean, expected_clean);
+    t.end();
+
+  });
+
+};
+
+// register every test case with tape under a common name prefix
+module.exports.all = function (tape, common) {
+  function test(name, testFunction) {
+    return tape('SANITISER _text_autocomplete: ' + name, testFunction);
+  }
+
+  for( var testCase in module.exports.tests ){
+    module.exports.tests[testCase](test, common);
+  }
+};