You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

151 lines
5.2 KiB

const peliasQuery = require('pelias-query');
/*
feat(autocomplete): add hard distance filter to short focus.point queries

Short autocomplete inputs are very difficult to serve in a performant and
low-latency way. With shorter inputs, many more documents match for just about
any input string. In our testing, one to three character input texts generally
match up to 100 million documents out of a 560 million document full planet
build.

There's really no way to make scoring 100 million documents fast, so in order
to achieve acceptable performance (ideally, <100ms P99 latency), it's worth
looking at ways to either avoid querying Elasticsearch or reduce the scope of
autocomplete queries.

Short autocomplete queries without a focus.point parameter can be cached. There
are only 47,000 possible 1-3 character alphanumerical inputs. At this time,
caching is outside the scope of Pelias itself but can easily be implemented
with Varnish, Nginx, Fastly, Cloudfront, and lots of other tools and services.

Queries with a `focus.point` are effectively uncacheable however, since the
coordinate chosen will often be unique.

This PR uses the `focus.point` coordinate to build a hard filter limiting the
search to documents only within a certain radius of the coordinate. This can
reduce the number of documents searched and improve performance, while still
returning results that are useful.

It takes two parameters, driven by `pelias-config`:

- `api.autocomplete.focusHardLimitTextLength`: the maximum length of text for
  which a hard distance filter will be constructed
- `api.autocomplete.focusHardLimitMultiplier`: the length of the input text
  will be multiplied by this number to get the total hard filter radius in
  kilometers.

For example, with `focusHardLimitTextLength` 4, and `focusHardLimitMultiplier`
50, the following hard filters would be constructed:

| text length | max distance |
| ----------- | ------------ |
| 1           | 50           |
| 2           | 100          |
| 3           | 150          |
| 4+          | unlimited    |
*/
6 years ago
const config = require('pelias-config').generate();
const defaults = require('./autocomplete_defaults');
const textParser = require('./text_parser_addressit');
const check = require('check-types');
const logger = require('pelias-logger').get('api');
9 years ago
// additional views (these may be merged in to pelias/query at a later date)
// Each entry is a view module local to this package; the views are attached
// to the autocomplete query layout below via query.score().
const views = {
  ngrams_strict: require('./view/ngrams_strict'),
  ngrams_last_token_only: require('./view/ngrams_last_token_only'),
  phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),
  pop_subquery: require('./view/pop_subquery'),
  boost_exact_matches: require('./view/boost_exact_matches')
};
9 years ago
//------------------------------
// autocomplete query
//------------------------------
// The layout is built once at module load; generateQuery() renders it with
// per-request variables.
const query = new peliasQuery.layout.FilteredBooleanQuery();

// mandatory matches
query.score( views.phrase_first_tokens_only, 'must' );
query.score( views.ngrams_last_token_only, 'must' );

// address components
query.score( peliasQuery.view.address('housenumber') );
query.score( peliasQuery.view.address('street') );
query.score( peliasQuery.view.address('postcode') );

// admin components
query.score( peliasQuery.view.admin('country') );
query.score( peliasQuery.view.admin('country_a') );
query.score( peliasQuery.view.admin('region') );
query.score( peliasQuery.view.admin('region_a') );
query.score( peliasQuery.view.admin('county') );
query.score( peliasQuery.view.admin('borough') );
query.score( peliasQuery.view.admin('localadmin') );
query.score( peliasQuery.view.admin('locality') );
query.score( peliasQuery.view.admin('neighbourhood') );

// scoring boost
query.score( views.boost_exact_matches );
query.score( peliasQuery.view.focus( views.ngrams_strict ) );
query.score( peliasQuery.view.popularity( views.pop_subquery ) );
query.score( peliasQuery.view.population( views.pop_subquery ) );

// non-scoring hard filters
query.filter( peliasQuery.view.sources );
query.filter( peliasQuery.view.layers );
query.filter( peliasQuery.view.boundary_rect );
query.filter( peliasQuery.view.boundary_country );

// the boundary circle variables are set by generateQuery() for short inputs
// that carry a focus.point, limiting the search to a radius around that point
query.filter( peliasQuery.view.boundary_circle );
// --------------------------------
/**
  map request variables to query variables for all inputs
  provided by this HTTP request.

  @param {object} clean - request parameters; the properties read here are
    `sources`, `layers`, `boundary.country`, `tokens`, `tokens_complete`,
    `tokens_incomplete`, `text`, `focus.point.lat`, `focus.point.lon`,
    `boundary.rect.*` and `parsed_text`.
  @returns {{type: string, body: *}} an object whose `type` is 'autocomplete'
    and whose `body` is the query layout rendered with the collected variables.
**/
function generateQuery( clean ){

  const vs = new peliasQuery.Vars( defaults );

  // sources
  if( check.array(clean.sources) && clean.sources.length ){
    vs.var( 'sources', clean.sources );
  }

  // layers
  if( check.array(clean.layers) && clean.layers.length ){
    vs.var( 'layers', clean.layers);
  }

  // boundary country
  if( check.string(clean['boundary.country']) ){
    vs.set({
      'boundary:country': clean['boundary.country']
    });
  }

  // pass the input tokens to the views so they can choose which tokens
  // are relevant for their specific function.
  if( check.array( clean.tokens ) ){
    vs.var( 'input:name:tokens', clean.tokens );
    vs.var( 'input:name:tokens_complete', clean.tokens_complete );
    vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete );
  }

  // input text
  vs.var( 'input:name', clean.text );

  // if the tokenizer has run then we set 'input:name' to the combination of the
  // 'complete' tokens with the 'incomplete' tokens. The resulting array differs
  // slightly from the 'input:name:tokens' array as some tokens might have been
  // removed in the process, such as single grams which are not present in the
  // ngrams index.
  if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){
    const combined = clean.tokens_complete.concat( clean.tokens_incomplete );
    if( combined.length ){
      vs.var( 'input:name', combined.join(' ') );
    }
  }

  // focus point
  if( check.number(clean['focus.point.lat']) &&
      check.number(clean['focus.point.lon']) ){
    vs.set({
      'focus:point:lat': clean['focus.point.lat'],
      'focus:point:lon': clean['focus.point.lon']
    });

    // search only near the focus.point for short inputs
    // this reduces the number of documents hit and keeps latency low
    //
    // focusHardLimitTextLength: inputs shorter than this get a hard distance
    //   filter (falls back to 0, which disables the filter)
    // focusHardLimitMultiplier: multiplied by the input text length to get the
    //   filter radius in kilometers (falls back to 50)
    const hardLimitTextLength = config.get('api.autocomplete.focusHardLimitTextLength') || 0;
    const distanceMultiplier = config.get('api.autocomplete.focusHardLimitMultiplier') || 50;

    if (clean.text.length < hardLimitTextLength) {
      vs.set({
        'boundary:circle:lat': clean['focus.point.lat'],
        'boundary:circle:lon': clean['focus.point.lon'],
        // use the configured multiplier rather than a hard-coded 50
        'boundary:circle:radius': `${distanceMultiplier * clean.text.length}km`
      });
    }
  }

  // boundary rect
  if( check.number(clean['boundary.rect.min_lat']) &&
      check.number(clean['boundary.rect.max_lat']) &&
      check.number(clean['boundary.rect.min_lon']) &&
      check.number(clean['boundary.rect.max_lon']) ){
    vs.set({
      'boundary:rect:top': clean['boundary.rect.max_lat'],
      'boundary:rect:right': clean['boundary.rect.max_lon'],
      'boundary:rect:bottom': clean['boundary.rect.min_lat'],
      'boundary:rect:left': clean['boundary.rect.min_lon']
    });
  }

  // run the address parser
  if( clean.parsed_text ){
    textParser( clean.parsed_text, vs );
  }

  return {
    type: 'autocomplete',
    body: query.render(vs)
  };
}
module.exports = generateQuery;