You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

121 lines
3.4 KiB

var check = require('check-types');
var parser = require('addressit');
var _ = require('lodash');
var logger = require('pelias-logger').get('api');
// ref: https://en.wikipedia.org/wiki/Quotation_mark
// Set of quotation-mark characters; _sanitize passes this string to _.trim as the
// characters to strip from both ends of the input text.
// NOTE(review): the final four characters appear to duplicate entries that already
// occur earlier in the list — presumably other variants were intended; confirm.
const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`;
/**
 * Validate the `text` parameter, then store the cleaned text and its parsed
 * form on `clean`.
 *
 * The text is stripped of surrounding whitespace, then of wrapping quotation
 * marks (any character in QUOTES), then of whitespace that was enclosed by
 * those quotes. If nothing is left, an error message is returned and `clean`
 * is not modified.
 *
 * @param {object} raw - unsanitised parameters; only `raw.text` is read
 * @param {object} clean - sanitised parameters, mutated in place: `text` and
 *   `parser` are set, `parsed_text` is replaced (or removed when the parser
 *   yields nothing)
 * @returns {{errors: string[], warnings: string[]}} validation messages
 */
function _sanitize( raw, clean ){
  // error & warning messages
  const messages = { errors: [], warnings: [] };

  // the outermost trim handles whitespace that sat inside the quotes, so that
  // `" paris "` becomes `paris` and `" "` is rejected as empty
  const text = _.trim( _.trim( _.trim( raw.text ), QUOTES ) );

  // invalid input 'text'
  if( !check.nonEmptyString( text ) ){
    messages.errors.push('invalid param \'text\': text length, must be >0');
    return messages;
  }

  // valid input 'text'
  clean.text = text;
  clean.parser = 'addressit';

  // remove anything that may have been parsed before
  delete clean.parsed_text;

  // parse text with query parser; the parser may return null, in which case
  // parsed_text stays absent
  const parsed_text = parse(clean.text);
  if (check.assigned(parsed_text)) {
    clean.parsed_text = parsed_text;
  }

  return messages;
}
/**
 * Describe the parameters this sanitizer reads.
 *
 * @returns {Array<{name: string}>} a fresh single-entry list naming `text`
 */
function _expected(){
  const textParam = { name: 'text' };
  return [ textParam ];
}
// export function
module.exports = () => ({
sanitize: _sanitize,
expected: _expected
});
// this is the addressit functionality from https://github.com/pelias/text-analyzer/blob/master/src/addressItParser.js

// separator used by parse() to split the query into parts and to re-join them
const DELIM = ',';
/**
 * Parse a free-text query into address components.
 *
 * Two sources are combined:
 *  1. a naive split on DELIM: when the query contains at least one delimiter,
 *     the first part becomes `name` and the remaining parts, trimmed and
 *     re-joined, become `admin_parts`;
 *  2. the address parser, run on the re-joined query unless that string is
 *     3 characters or shorter. Its properties overwrite same-named ones from
 *     step 1.
 *
 * Only the known address keys with truthy values are copied to the result.
 *
 * @param {string} query - the sanitised query text
 * @returns {object|null} the parsed components (possibly empty), or null when
 *   `regions` is the only component found
 */
function parse(query) {
  // the only keys that are ever copied to the result
  const addressKeys = [
    'name',
    'number',
    'street',
    'city',
    'state',
    'country',
    'postalcode',
    'regions',
    'admin_parts'
  ];
  const separator = DELIM + ' ';
  const queryParts = query.split(DELIM);

  // naive approach - for admin matching during query time
  // split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
  const combined = {};
  if (queryParts.length > 1) {
    combined.name = queryParts[0].trim();
    combined.admin_parts = queryParts.slice(1)
      .map((part) => part.trim())
      .join(separator);
  }

  // perform full address parsing
  // except on queries so short they obviously can't contain an address
  const rejoined = queryParts.join(separator);
  if (rejoined.length > 3) {
    _.extend(combined, parser( rejoined ));
  }

  const parsed_text = {};
  addressKeys.forEach((key) => {
    if (combined[key]) {
      parsed_text[key] = combined[key];
    }
  });

  // if all we found was regions, ignore it as it is not enough information to make smarter decisions
  const foundKeys = Object.keys(parsed_text);
  if (foundKeys.length === 1 && !_.isUndefined(parsed_text.regions)) {
    logger.info('Ignoring address parser output, regions only', {
      // `query` is the text itself (a string), so log it directly
      text: query,
      parsed: parsed_text
    });
    return null;
  }

  return parsed_text;
}