mirror of https://github.com/pelias/api.git
106 lines
3.5 KiB
106 lines
3.5 KiB
|
|
var check = require('check-types'); |
|
|
|
/** |
|
simplified version of the elaticsearch tokenizer, used in order to |
|
be able to detect which tokens are 'complete' (user has finished typing them) |
|
or 'incomplete' (the user has possibly only typed part of the token). |
|
|
|
note: we don't need to strip punctuation as that will be handled on the |
|
elasticsearch side, so sending a token such as 'st.' is not an issue, these |
|
tokens should *not* be modified as the anaylsis can use the punctuation to |
|
infer meaning. |
|
|
|
note: this sanitizer should run *after* the '_text' sanitizer so it can |
|
use the output of clean.parsed_text where available. |
|
**/ |
|
function sanitize( raw, clean ){ |
|
|
|
// error & warning messages |
|
var messages = { errors: [], warnings: [] }; |
|
|
|
// this is the string we will use for analysis |
|
var text = clean.text; |
|
|
|
// a boolean to track whether the input parser successfully ran; or not. |
|
var inputParserRanSuccessfully = false; |
|
|
|
// if the text parser has run then we only tokenize the 'name' section |
|
// of the 'parsed_text' object, ignoring the 'admin' parts. |
|
if( clean.hasOwnProperty('parsed_text') ) { |
|
inputParserRanSuccessfully = true; |
|
|
|
// parsed_text.name is set, this is the highest priority, use this string |
|
if( clean.parsed_text.hasOwnProperty('name') ){ |
|
text = clean.parsed_text.name; // use this string instead |
|
} |
|
|
|
// else handle the case where parsed_text.street was produced but |
|
// no parsed_text.name is produced. |
|
// additionally, handle the case where parsed_text.number is present |
|
// note: the addressit module may also produce parsed_text.unit info |
|
// for now, we discard that information as we don't have an appropriate |
|
else if( clean.parsed_text.hasOwnProperty('street') ){ |
|
text = [ |
|
clean.parsed_text.number, |
|
clean.parsed_text.street |
|
].filter(function(el){return el;}) |
|
.join(' '); // remove empty elements |
|
} |
|
} |
|
|
|
// always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. |
|
clean.tokens = []; |
|
clean.tokens_complete = []; |
|
clean.tokens_incomplete = []; |
|
|
|
// sanity check that the text is valid. |
|
if( check.nonEmptyString( text ) ){ |
|
|
|
// split according to the regex used in the elasticsearch tokenizer |
|
// see: https://github.com/pelias/schema/blob/master/settings.js |
|
// see: settings.analysis.tokenizer.peliasNameTokenizer |
|
clean.tokens = text |
|
.split(/[\s,\\\/]+/) // split on delimeters |
|
.filter(function(el){return el;}); // remove empty elements |
|
} |
|
|
|
/** |
|
the following section splits the tokens in to two arrays called |
|
'tokens_complete' and 'tokens_incomplete'. |
|
|
|
it also strips any tokens from 'tokens_incomplete' which might not |
|
match the ngrams index (such as single grams not stored in the index). |
|
**/ |
|
|
|
// split the tokens in to 'complete' and 'incomplete'. |
|
if( clean.tokens.length ){ |
|
|
|
// if all the tokens are complete, simply copy them from clean.tokens |
|
if( inputParserRanSuccessfully ){ |
|
|
|
// all these tokens are complete! |
|
clean.tokens_complete = clean.tokens.slice(); |
|
|
|
// user hasn't finished typing yet |
|
} else { |
|
|
|
// make a copy of the tokens and remove the last element |
|
var tokensCopy = clean.tokens.slice(), |
|
lastToken = tokensCopy.pop(); |
|
|
|
// set all but the last token as 'complete' |
|
clean.tokens_complete = tokensCopy; |
|
|
|
if( lastToken ){ |
|
clean.tokens_incomplete = [ lastToken ]; |
|
} |
|
} |
|
|
|
} |
|
|
|
return messages; |
|
} |
|
|
|
// export function |
|
module.exports = sanitize;
|
|
|