api/query/autocomplete.js

const peliasQuery = require('pelias-query');
const config = require('pelias-config').generate();
const defaults = require('./autocomplete_defaults');
const textParser = require('./text_parser_addressit');
const check = require('check-types');
const logger = require('pelias-logger').get('api');

// extra views that live in this repository for now
// (they may move in to pelias/query at a later date)
const views = {
  ngrams_strict: require('./view/ngrams_strict'),
  ngrams_last_token_only: require('./view/ngrams_last_token_only'),
  phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),
  pop_subquery: require('./view/pop_subquery'),
  boost_exact_matches: require('./view/boost_exact_matches')
};

//------------------------------
// autocomplete query
//------------------------------
// The layout is assembled once at module load; views are registered in the
// same sequence as listed below.
const query = new peliasQuery.layout.FilteredBooleanQuery();

// mandatory matches
[
  views.phrase_first_tokens_only,
  views.ngrams_last_token_only
].forEach( (view) => {
  query.score( view, 'must' );
});

// address components
[ 'housenumber', 'street', 'postcode' ].forEach( (field) => {
  query.score( peliasQuery.view.address( field ) );
});

// admin components
[
  'country',
  'country_a',
  'region',
  'region_a',
  'county',
  'borough',
  'localadmin',
  'locality',
  'neighbourhood'
].forEach( (field) => {
  query.score( peliasQuery.view.admin( field ) );
});

// scoring boost
query.score( views.boost_exact_matches );
query.score( peliasQuery.view.focus( views.ngrams_strict ) );
query.score( peliasQuery.view.popularity( views.pop_subquery ) );
query.score( peliasQuery.view.population( views.pop_subquery ) );

// non-scoring hard filters
[
  peliasQuery.view.sources,
  peliasQuery.view.layers,
  peliasQuery.view.boundary_rect,
  peliasQuery.view.boundary_country,
  peliasQuery.view.boundary_circle
].forEach( (view) => {
  query.filter( view );
});

// --------------------------------

/**
  map request variables to query variables for all inputs
  provided by this HTTP request, then render the autocomplete query.

  @param {object} clean - request parameters (presumably already sanitized
    upstream - confirm against the caller). Properties read here: `sources`,
    `layers`, `boundary.country`, `tokens`, `tokens_complete`,
    `tokens_incomplete`, `text`, `focus.point.lat`, `focus.point.lon`,
    `boundary.rect.min_lat|max_lat|min_lon|max_lon` and `parsed_text`.
  @returns {object} `{ type: 'autocomplete', body }` where `body` is the
    module-level query rendered with the variables collected below.
**/
function generateQuery( clean ){

  const vs = new peliasQuery.Vars( defaults );

  // sources
  if( check.array(clean.sources) && clean.sources.length ){
    vs.var( 'sources', clean.sources );
  }

  // layers
  if( check.array(clean.layers) && clean.layers.length ){
    vs.var( 'layers', clean.layers);
  }

  // boundary country
  if( check.string(clean['boundary.country']) ){
    vs.set({
      'boundary:country': clean['boundary.country']
    });
  }

  // pass the input tokens to the views so they can choose which tokens
  // are relevant for their specific function.
  if( check.array( clean.tokens ) ){
    vs.var( 'input:name:tokens', clean.tokens );
    vs.var( 'input:name:tokens_complete', clean.tokens_complete );
    vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete );
  }

  // input text
  vs.var( 'input:name', clean.text );

  // if the tokenizer has run then we set 'input:name' to the combination of the
  // 'complete' tokens with the 'incomplete' tokens; the resulting array differs
  // slightly from the 'input:name:tokens' array as some tokens might have been
  // removed in the process, such as single grams which are not present in the
  // ngrams index.
  if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){
    const combined = clean.tokens_complete.concat( clean.tokens_incomplete );
    if( combined.length ){
      vs.var( 'input:name', combined.join(' ') );
    }
  }

  // focus point
  if( check.number(clean['focus.point.lat']) &&
      check.number(clean['focus.point.lon']) ){
    vs.set({
      'focus:point:lat': clean['focus.point.lat'],
      'focus:point:lon': clean['focus.point.lon']
    });

    // search only near the focus.point for short inputs
    // this reduces the number of documents hit and keeps latency low
    //
    // a missing/zero text-length limit falls back to 0, which disables the
    // filter because no text length is less than 0; a missing/zero multiplier
    // falls back to 50.
    const hardLimitTextLength = config.get('api.autocomplete.focusHardLimitTextLength') || 0;
    const distanceMultiplier  = config.get('api.autocomplete.focusHardLimitMultiplier') || 50;
    if (clean.text.length < hardLimitTextLength) {
      vs.set({
        'boundary:circle:lat': clean['focus.point.lat'],
        'boundary:circle:lon': clean['focus.point.lon'],
        // radius in km = configured multiplier * number of characters typed
        'boundary:circle:radius': `${distanceMultiplier * clean.text.length}km`
      });
    }
  }

  // boundary rect
  if( check.number(clean['boundary.rect.min_lat']) &&
      check.number(clean['boundary.rect.max_lat']) &&
      check.number(clean['boundary.rect.min_lon']) &&
      check.number(clean['boundary.rect.max_lon']) ){
    vs.set({
      'boundary:rect:top': clean['boundary.rect.max_lat'],
      'boundary:rect:right': clean['boundary.rect.max_lon'],
      'boundary:rect:bottom': clean['boundary.rect.min_lat'],
      'boundary:rect:left': clean['boundary.rect.min_lon']
    });
  }

  // run the address parser
  if( clean.parsed_text ){
    textParser( clean.parsed_text, vs );
  }

  return {
    type: 'autocomplete',
    body: query.render(vs)
  };
}

module.exports = generateQuery;
fix: add parser and query param logging 8 years ago			`const peliasQuery = require('pelias-query');`
feat(autocomplete) add hard distance filter to short focus.point queries Short autocomplete inputs are very difficult to serve in a performant and low-latency way. With shorter inputs, many more documents match for just about any input string. In our testing, one to three character input texts generally match up to 100 million documents out of a 560 million document full planet build. There's really no way to make scoring 100 million documents fast, so in order to achieve acceptable performance (ideally, <100ms P99 latency), it's worth looking at ways to either avoid querying Elasticsearch or reducing the scope of autocomplete queries. Short autocomplete queries without a focus.point parameter can be cached. There are only 47,000 possible 1-3 character alphanumerical inputs. At this time, caching is outside the scope of Pelias itself but can easily be implemented with Varnish, Nginx, Fastly, Cloudfront, and lots of other tools and services. Queries with a `focus.point` are effectively uncachable however, since the coordinate chosen will often be unique. This PR uses the `focus.point` coordinate to build a hard filter limiting the search to documents only within a certain radius of the coordinate. This can reduce the number of documents searched and improve performance, while still returning results that are useful. It takes two parameters, driven by `pelias-config`: - `api.autocomplete.focusHardLimitTextLength': the maximum length of text for which a hard distance filter will be constructed - `api.autocomplete.focusHardLimitMultiplier`: the length of the input text will be multiplied by this number to get the total hard filter radius in kilometers. For example, with `focusHardLimitTextLength` 4, and `focusHardLimitMultiplier` 50, the following hard filters would be constructed: \| text length \| max distance \| \| ---- \| ----\| \| 1 \| 50 \| \| 2 \| 100 \| \| 3 \| 150 \| \| 4+ \| unlimited \| 6 years ago			`const config = require('pelias-config').generate();`
fix: add parser and query param logging 8 years ago			`const defaults = require('./autocomplete_defaults');`
			`const textParser = require('./text_parser_addressit');`
			`const check = require('check-types');`
			`const logger = require('pelias-logger').get('api');`
add autocomplete route, further query clean up 9 years ago
simplify pass 1 9 years ago			`// additional views (these may be merged in to pelias/query at a later date)`
simplify pass 2 9 years ago			`var views = {`
simplify pass 3 9 years ago			`ngrams_strict: require('./view/ngrams_strict'),`
			`ngrams_last_token_only: require('./view/ngrams_last_token_only'),`
increase focus weight from 10->40 and simplify population/popularity subview 9 years ago			`phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),`
add a view to boost exact matches 9 years ago			`pop_subquery: require('./view/pop_subquery'),`
			`boost_exact_matches: require('./view/boost_exact_matches')`
more tweaks 9 years ago			`};`
autocomplete tweaks 9 years ago
more tweaks 9 years ago			`//------------------------------`
			`// autocomplete query`
			`//------------------------------`
			`var query = new peliasQuery.layout.FilteredBooleanQuery();`
autocomplete tweaks 9 years ago
more tweaks 9 years ago			`// mandatory matches`
simplify pass 3 9 years ago			`query.score( views.phrase_first_tokens_only, 'must' );`
			`query.score( views.ngrams_last_token_only, 'must' );`
more tweaks 9 years ago
more tweaks 9 years ago			`// address components`
			`query.score( peliasQuery.view.address('housenumber') );`
			`query.score( peliasQuery.view.address('street') );`
			`query.score( peliasQuery.view.address('postcode') );`
more tweaks 9 years ago
more tweaks 9 years ago			`// admin components`
Remove all usage of alpha3/admin0/admin1, update query building to reflect new names in pelias-query 9 years ago			`query.score( peliasQuery.view.admin('country') );`
			`query.score( peliasQuery.view.admin('country_a') );`
			`query.score( peliasQuery.view.admin('region') );`
			`query.score( peliasQuery.view.admin('region_a') );`
			`query.score( peliasQuery.view.admin('county') );`
fix borough matching for both autocomplete and search endpoints 9 years ago			`query.score( peliasQuery.view.admin('borough') );`
Remove all usage of alpha3/admin0/admin1, update query building to reflect new names in pelias-query 9 years ago			`query.score( peliasQuery.view.admin('localadmin') );`
more tweaks 9 years ago			`query.score( peliasQuery.view.admin('locality') );`
Remove all usage of alpha3/admin0/admin1, update query building to reflect new names in pelias-query 9 years ago			`query.score( peliasQuery.view.admin('neighbourhood') );`
more tweaks 9 years ago
more tweaks 9 years ago			`// scoring boost`
add a view to boost exact matches 9 years ago			`query.score( views.boost_exact_matches );`
Remove focus_selected_layers query view This query extends the standard focus query view with hardcoded layers for which the query applies. The intent was to apply the focus scoring only to non-admin areas, but the list of layers was already out of date, as it was missing streets. The query is fundamentally problematic with custom layers as well. 7 years ago			`query.score( peliasQuery.view.focus( views.ngrams_strict ) );`
increase focus weight from 10->40 and simplify population/popularity subview 9 years ago			`query.score( peliasQuery.view.popularity( views.pop_subquery ) );`
			`query.score( peliasQuery.view.population( views.pop_subquery ) );`
add autocomplete route, further query clean up 9 years ago
bugfixes and more tests 9 years ago			`// non-scoring hard filters`
			`query.filter( peliasQuery.view.sources );`
Set up layer filtering for autocomplete and reverse This was missed by me when working on https://github.com/pelias/api/pull/580, but caught by the acceptance tests! Unfortunately it was caught after going to production. 9 years ago			`query.filter( peliasQuery.view.layers );`
Add boundary.rect handling to query/autocomplete 8 years ago			`query.filter( peliasQuery.view.boundary_rect );`
fix(boundary.country): use boundary.country as filter By definition, all boundary.country query matches will either be identical, or not a match. Thus, it does not make sense to put the query clause for boundary.country in the `must` section of the query. In theory, because our queries would generally combine this `must` clause with others, there shouldn't be any performance improvement (or regression) from this change. However, semantically, this clause fits better as a `filter`, and in the case of a bug causing a degenerate query with the `boundary.country` query clause as the only one under the `must` section, this would have a big impact. 6 years ago			`query.filter( peliasQuery.view.boundary_country );`
feat(autocomplete) add hard distance filter to short focus.point queries Short autocomplete inputs are very difficult to serve in a performant and low-latency way. With shorter inputs, many more documents match for just about any input string. In our testing, one to three character input texts generally match up to 100 million documents out of a 560 million document full planet build. There's really no way to make scoring 100 million documents fast, so in order to achieve acceptable performance (ideally, <100ms P99 latency), it's worth looking at ways to either avoid querying Elasticsearch or reducing the scope of autocomplete queries. Short autocomplete queries without a focus.point parameter can be cached. There are only 47,000 possible 1-3 character alphanumerical inputs. At this time, caching is outside the scope of Pelias itself but can easily be implemented with Varnish, Nginx, Fastly, Cloudfront, and lots of other tools and services. Queries with a `focus.point` are effectively uncachable however, since the coordinate chosen will often be unique. This PR uses the `focus.point` coordinate to build a hard filter limiting the search to documents only within a certain radius of the coordinate. This can reduce the number of documents searched and improve performance, while still returning results that are useful. It takes two parameters, driven by `pelias-config`: - `api.autocomplete.focusHardLimitTextLength': the maximum length of text for which a hard distance filter will be constructed - `api.autocomplete.focusHardLimitMultiplier`: the length of the input text will be multiplied by this number to get the total hard filter radius in kilometers. For example, with `focusHardLimitTextLength` 4, and `focusHardLimitMultiplier` 50, the following hard filters would be constructed: \| text length \| max distance \| \| ---- \| ----\| \| 1 \| 50 \| \| 2 \| 100 \| \| 3 \| 150 \| \| 4+ \| unlimited \| 6 years ago			`query.filter( peliasQuery.view.boundary_circle );`
bugfixes and more tests 9 years ago
add autocomplete route, further query clean up 9 years ago			`// --------------------------------`

			`/**`
			`map request variables to query variables for all inputs`
			`provided by this HTTP request.`
			`**/`
			`function generateQuery( clean ){`

fix: add parser and query param logging 8 years ago			`const vs = new peliasQuery.Vars( defaults );`

bugfixes and more tests 9 years ago			`// sources`
			`if( check.array(clean.sources) && clean.sources.length ){`
			`vs.var( 'sources', clean.sources );`
			`}`

Set up layer filtering for autocomplete and reverse This was missed by me when working on https://github.com/pelias/api/pull/580, but caught by the acceptance tests! Unfortunately it was caught after going to production. 9 years ago			`// layers`
			`if( check.array(clean.layers) && clean.layers.length ){`
			`vs.var( 'layers', clean.layers);`
			`}`

Add boundary.country filter to /v1/autocomplete 8 years ago			`// boundary country`
			`if( check.string(clean['boundary.country']) ){`
			`vs.set({`
			`'boundary:country': clean['boundary.country']`
			`});`
			`}`

add tokenizer, refactor how we determine if a token is 'complete' or 'incomplete' 9 years ago			`// pass the input tokens to the views so they can choose which tokens`
			`// are relevant for their specific function.`
			`if( check.array( clean.tokens ) ){`
			`vs.var( 'input:name:tokens', clean.tokens );`
			`vs.var( 'input:name:tokens_complete', clean.tokens_complete );`
			`vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete );`
			`}`

			`// input text`
			`vs.var( 'input:name', clean.text );`
hack hack hack 9 years ago
ensure that problematic single grams are removed from the query 9 years ago			`// if the tokenizer has run then we set 'input:name' to as the combination of the`
			`// 'complete' tokens with the 'incomplete' tokens, the resuting array differs`
			`// slightly from the 'input:name:tokens' array as some tokens might have been`
			`// removed in the process; such as single grams which are not present in then`
			`// ngrams index.`
			`if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){`
			`var combined = clean.tokens_complete.concat( clean.tokens_incomplete );`
			`if( combined.length ){`
			`vs.var( 'input:name', combined.join(' ') );`
			`}`
hack hack hack 9 years ago			`}`
more tweaks 9 years ago
add autocomplete route, further query clean up 9 years ago			`// focus point`
more conformance 9 years ago			`if( check.number(clean['focus.point.lat']) &&`
			`check.number(clean['focus.point.lon']) ){`
add autocomplete route, further query clean up 9 years ago			`vs.set({`
Use flat clean structure in query/autocomplete.js 9 years ago			`'focus:point:lat': clean['focus.point.lat'],`
			`'focus:point:lon': clean['focus.point.lon']`
add autocomplete route, further query clean up 9 years ago			`});`
feat(autocomplete) add hard distance filter to short focus.point queries Short autocomplete inputs are very difficult to serve in a performant and low-latency way. With shorter inputs, many more documents match for just about any input string. In our testing, one to three character input texts generally match up to 100 million documents out of a 560 million document full planet build. There's really no way to make scoring 100 million documents fast, so in order to achieve acceptable performance (ideally, <100ms P99 latency), it's worth looking at ways to either avoid querying Elasticsearch or reducing the scope of autocomplete queries. Short autocomplete queries without a focus.point parameter can be cached. There are only 47,000 possible 1-3 character alphanumerical inputs. At this time, caching is outside the scope of Pelias itself but can easily be implemented with Varnish, Nginx, Fastly, Cloudfront, and lots of other tools and services. Queries with a `focus.point` are effectively uncachable however, since the coordinate chosen will often be unique. This PR uses the `focus.point` coordinate to build a hard filter limiting the search to documents only within a certain radius of the coordinate. This can reduce the number of documents searched and improve performance, while still returning results that are useful. It takes two parameters, driven by `pelias-config`: - `api.autocomplete.focusHardLimitTextLength': the maximum length of text for which a hard distance filter will be constructed - `api.autocomplete.focusHardLimitMultiplier`: the length of the input text will be multiplied by this number to get the total hard filter radius in kilometers. For example, with `focusHardLimitTextLength` 4, and `focusHardLimitMultiplier` 50, the following hard filters would be constructed: \| text length \| max distance \| \| ---- \| ----\| \| 1 \| 50 \| \| 2 \| 100 \| \| 3 \| 150 \| \| 4+ \| unlimited \| 6 years ago
			`// search only near the focus.point for short inputs`
			`// this reduces the numer of documents hit and keeps latency low`
			`const hardLimitTextLength = config.get('api.autocomplete.focusHardLimitTextLength') \|\| 0;`
			`const distanceMultplier = config.get('api.autocomplete.focusHardLimitMultiplier') \|\| 50;`
			`if (clean.text.length < hardLimitTextLength) {`
			`vs.set({`
			`'boundary:circle:lat': clean['focus.point.lat'],`
			`'boundary:circle:lon': clean['focus.point.lon'],`
			'boundary:circle:radius': `${50 * clean.text.length}km`
			`});`
			`}`
add autocomplete route, further query clean up 9 years ago			`}`

Add boundary.rect handling to query/autocomplete 8 years ago			`// boundary rect`
			`if( check.number(clean['boundary.rect.min_lat']) &&`
			`check.number(clean['boundary.rect.max_lat']) &&`
			`check.number(clean['boundary.rect.min_lon']) &&`
			`check.number(clean['boundary.rect.max_lon']) ){`
			`vs.set({`
			`'boundary:rect:top': clean['boundary.rect.max_lat'],`
			`'boundary:rect:right': clean['boundary.rect.max_lon'],`
			`'boundary:rect:bottom': clean['boundary.rect.min_lat'],`
			`'boundary:rect:left': clean['boundary.rect.min_lon']`
			`});`
			`}`

enable text parsing 9 years ago			`// run the address parser`
			`if( clean.parsed_text ){`
			`textParser( clean.parsed_text, vs );`
			`}`

change the query module interfaces back to simple functions 8 years ago			`return {`
			`type: 'autocomplete',`
			`body: query.render(vs)`
			`};`
add autocomplete route, further query clean up 9 years ago			`}`

Add boundary.rect handling to query/autocomplete 8 years ago			`module.exports = generateQuery;`