|
|
|
var check = require('check-types');
|
|
|
|
var parser = require('addressit');
|
|
|
|
var _ = require('lodash');
|
|
|
|
var logger = require('pelias-logger').get('api');
|
|
|
|
|
|
|
|
// ref: https://en.wikipedia.org/wiki/Quotation_mark
// characters treated as quotation marks; they are stripped from both ends of the input text
// NOTE(review): the last four characters appear to repeat entries already present
// earlier in the list - confirm whether distinct fullwidth/halfwidth variants were intended
const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`;

/**
 * validate texts, convert types and apply defaults
 *
 * Reads `raw.text`, strips surrounding whitespace and quotation marks, and
 * either records an error (empty text) or stores the cleaned text plus the
 * parser output on `clean`.
 *
 * @param {object} raw - raw request parameters; only `raw.text` is read
 * @param {object} clean - mutated in place: `text` and `parser` are set,
 *   `parsed_text` is deleted and then re-set when the parser returns a value
 * @returns {{errors: string[], warnings: string[]}} messages; `warnings` is
 *   never populated by this function
 */
function _sanitize( raw, clean ){

  // error & warning messages
  const messages = { errors: [], warnings: [] };

  // 1. trim surrounding whitespace
  // 2. trim surrounding quotation marks
  // 3. trim whitespace that was enclosed by the quotes, so that input such as
  //    `" "` is rejected as empty and `" Main St "` does not keep its padding
  const text = _.trim( _.trim( _.trim( raw.text ), QUOTES ) );

  // invalid input 'text'
  if( !check.nonEmptyString( text ) ){
    messages.errors.push('invalid param \'text\': text length, must be >0');
    return messages;
  }

  // valid input 'text'
  clean.text = text;
  clean.parser = 'addressit';

  // remove anything that may have been parsed before
  delete clean.parsed_text;

  // parse text with query parser
  // (parse returns null when the output should be ignored, in which case
  // parsed_text stays unset)
  const parsed_text = parse(clean.text);
  if (check.assigned(parsed_text)) {
    clean.parsed_text = parsed_text;
  }

  return messages;
}
|
|
|
|
|
|
|
|
/**
 * Returns the list of expected parameter descriptors: a single entry
 * naming the 'text' parameter. A fresh array is built on every call.
 *
 * @returns {Array<{name: string}>}
 */
function _expected(){
  const textParam = { name: 'text' };
  return [ textParam ];
}
|
|
|
|
|
|
|
|
// export function
|
|
|
|
module.exports = () => ({
|
|
|
|
sanitize: _sanitize,
|
|
|
|
expected: _expected
|
|
|
|
});
|
|
|
|
|
|
|
|
// this is the addressit functionality from https://github.com/pelias/text-analyzer/blob/master/src/addressItParser.js
var DELIM = ',';

/**
 * Parse a free-text query into address components.
 *
 * Two results are combined: a naive split on DELIM (yielding `name` and
 * `admin_parts`) and the output of the addressit parser (only invoked for
 * queries longer than 3 characters). Only a fixed whitelist of truthy fields
 * is copied into the result.
 *
 * @param {string} query - the text to parse
 * @returns {object|null} the whitelisted fields that were found, or null when
 *   `regions` was the only field found
 */
function parse(query) {
  // naive approach - for admin matching during query time
  // split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
  const getAdminPartsBySplittingOnDelim = function(queryParts) {
    const address = {};

    if (queryParts.length > 1) {
      address.name = queryParts[0].trim();

      // 1. slice away all parts after the first one
      // 2. trim spaces from each part just in case
      // 3. join the parts back together with appropriate delimiter and spacing
      address.admin_parts = queryParts.slice(1)
        .map(function (part) { return part.trim(); })
        .join(DELIM + ' ');
    }

    return address;
  };

  // perform full address parsing
  // except on queries so short they obviously can't contain an address
  // (returns undefined for those short queries)
  const getAddressParts = function(text) {
    if (text.length > 3) {
      return parser( text );
    }
  };

  const queryParts = query.split(DELIM);

  const addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts);
  const addressWithAddressParts = getAddressParts(queryParts.join(DELIM + ' '));

  // combine the 2 objects
  // (parser fields overwrite same-named fields from the delimiter split)
  _.extend(addressWithAdminParts, addressWithAddressParts);

  // whitelist of fields copied into the result
  const address_parts = [
    'name',
    'number',
    'street',
    'city',
    'state',
    'country',
    'postalcode',
    'regions',
    'admin_parts'
  ];

  const parsed_text = {};

  // copy only truthy fields, so empty strings / undefined are dropped
  address_parts.forEach(function(part){
    if (addressWithAdminParts[part]) {
      parsed_text[part] = addressWithAdminParts[part];
    }
  });

  // if all we found was regions, ignore it as it is not enough information to make smarter decisions
  if (Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions)) {
    logger.info('Ignoring address parser output, regions only', {
      // `query` is the input string itself; it has no `.text` property
      text: query,
      parsed: parsed_text
    });
    return null;
  }

  return parsed_text;
}
|