var check = require('check-types');

/**
  simplified version of the elasticsearch tokenizer, used in order to
  be able to detect which tokens are 'complete' (the user has finished
  typing them) and which are 'incomplete' (the user may have only typed
  part of the token).

  note: we don't need to strip punctuation as that will be handled on the
  elasticsearch side, so sending a token such as 'st.' is not an issue;
  these tokens should *not* be modified, as the analysis can use the
  punctuation to infer meaning.

  note: this sanitizer should run *after* the '_text' sanitizer so that it
  can use the output of clean.parsed_text where available.
**/
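// for example (an illustrative sketch; the values are hypothetical):
// given the autocomplete input 'union squ', this sanitizer produces
// tokens_complete = ['union'] and tokens_incomplete = ['squ'], since
// the user may still be typing the final token.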
function _sanitize( raw, clean ){

  // error & warning messages
  var messages = { errors: [], warnings: [] };

  // this is the string we will use for analysis
  var text = clean.text;

  // a boolean to track whether or not the input parser ran successfully
  var inputParserRanSuccessfully = false;

  // if the text parser has run then we only tokenize the 'name' section
  // of the 'parsed_text' object, ignoring the 'admin' parts.
  if( clean.hasOwnProperty('parsed_text') ) {
    inputParserRanSuccessfully = true;

    // parsed_text.name is set; it has the highest priority, so use that string
    if( clean.parsed_text.hasOwnProperty('name') ){
      text = clean.parsed_text.name; // use this string instead
    }

    // else handle the case where parsed_text.street was produced but
    // no parsed_text.name was produced.
    // additionally, handle the case where parsed_text.number is present.
    // note: the addressit module may also produce parsed_text.unit info;
    // for now, we discard that information as we don't have an appropriate
    // place to store it.
    else if( clean.parsed_text.hasOwnProperty('street') ){
      text = [
        clean.parsed_text.number,
        clean.parsed_text.street
      ].filter(function(el){return el;}) // remove empty elements
       .join(' ');
    }
  }
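  // for example (an illustrative sketch; these parsed_text values are hypothetical):
  //   { name: 'union square', admin_parts: 'new york' } -> text = 'union square'
  //   { number: '30', street: 'w 26th st' }             -> text = '30 w 26th st'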
  // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors.
  clean.tokens = [];
  clean.tokens_complete = [];
  clean.tokens_incomplete = [];

  // sanity check that the text is valid.
  if( check.nonEmptyString( text ) ){

    // split according to the regex used in the elasticsearch tokenizer
    // see: https://github.com/pelias/schema/blob/master/settings.js
    // see: settings.analysis.tokenizer.peliasNameTokenizer
    clean.tokens = text
      .split(/[\s,\\\/]+/) // split on delimiters
      .filter(function(el){return el;}); // remove empty elements
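    // for example (illustrative input):
    //   '30 w/26th st,ny' -> ['30', 'w', '26th', 'st', 'ny']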
  } else {
    // text is empty, so this sanitizer is a no-op
    return messages;
  }

  /**
  the following section splits the tokens into two arrays called
  'tokens_complete' and 'tokens_incomplete'.

  it also strips any tokens from 'tokens_incomplete' which might not
  match the ngrams index (such as single grams not stored in the index).
  **/
  // split the tokens into 'complete' and 'incomplete'.
  if( clean.tokens.length ){

    // if all the tokens are complete, simply copy them from clean.tokens
    if( inputParserRanSuccessfully ){

      // all these tokens are complete!
      clean.tokens_complete = clean.tokens.slice();

    // user hasn't finished typing yet
    } else {

      // make a copy of the tokens and remove the last element
      var tokensCopy = clean.tokens.slice(),
          lastToken = tokensCopy.pop();

      // set all but the last token as 'complete'
      clean.tokens_complete = tokensCopy;

      if( lastToken ){
        clean.tokens_incomplete = [ lastToken ];
      }
    }
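    // for example (an illustrative sketch):
    //   parsed input  'grolmanstrasse 1' -> tokens_complete = ['grolmanstrasse', '1'], tokens_incomplete = []
    //   unparsed text 'grolmanstr'       -> tokens_complete = [], tokens_incomplete = ['grolmanstr']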
  } else {
    // set error if no substantial tokens were found
    messages.errors.push('invalid `text` input: must contain more than just delimiters');
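    // e.g. (illustrative): text = ' , / ' splits to an empty array and triggers this error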
  }

  return messages;
}

// export function
module.exports = () => ({
  sanitize: _sanitize
});
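// usage sketch (hypothetical; the './_tokenizer' path is an assumption, and this
// assumes the standard pelias sanitizer contract of sanitize( raw, clean )
// mutating 'clean' and returning a messages object):
//
//   var tokenizer = require('./_tokenizer')();
//   var clean = { text: '30 w 26th st' };
//   var messages = tokenizer.sanitize( {}, clean );
//   // clean.tokens -> ['30', 'w', '26th', 'st']
//   // messages     -> { errors: [], warnings: [] }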