|
|
|
const peliasQuery = require('pelias-query');
|
/*
 * feat(autocomplete): add hard distance filter to short focus.point queries
 *
 * Short autocomplete inputs are very difficult to serve in a performant
 * and low-latency way. With shorter inputs, many more documents match for
 * just about any input string.
 *
 * In our testing, one- to three-character input texts generally match up to
 * 100 million documents out of a 560 million document full planet build.
 * There's really no way to make scoring 100 million documents fast, so in
 * order to achieve acceptable performance (ideally, <100ms P99 latency),
 * it's worth looking at ways to either avoid querying Elasticsearch or
 * reduce the scope of autocomplete queries.
 *
 * Short autocomplete queries without a focus.point parameter can be cached.
 * There are only 47,000 possible 1-3 character alphanumerical inputs. At
 * this time, caching is outside the scope of Pelias itself but can easily
 * be implemented with Varnish, Nginx, Fastly, Cloudfront, and lots of other
 * tools and services.
 *
 * Queries with a `focus.point` are effectively uncachable however, since
 * the coordinate chosen will often be unique.
 *
 * This change uses the `focus.point` coordinate to build a hard filter
 * limiting the search to documents only within a certain radius of the
 * coordinate. This can reduce the number of documents searched and improve
 * performance, while still returning results that are useful.
 *
 * It takes two parameters, driven by `pelias-config`:
 * - `api.autocomplete.focusHardLimitTextLength`: the maximum length of text
 *   for which a hard distance filter will be constructed
 * - `api.autocomplete.focusHardLimitMultiplier`: the length of the input
 *   text will be multiplied by this number to get the total hard filter
 *   radius in kilometers.
 *
 * For example, with `focusHardLimitTextLength` 4 and
 * `focusHardLimitMultiplier` 50, the following hard filters would be
 * constructed:
 *
 * | text length | max distance |
 * | ----------- | ------------ |
 * | 1           | 50           |
 * | 2           | 100          |
 * | 3           | 150          |
 * | 4+          | unlimited    |
 */
|
|
|
const config = require('pelias-config').generate();
|
|
|
|
const defaults = require('./autocomplete_defaults');
|
|
|
|
const textParser = require('./text_parser_addressit');
|
|
|
|
const check = require('check-types');
|
|
|
|
const logger = require('pelias-logger').get('api');
|
|
|
|
|
|
|
|
// additional views (these may be merged in to pelias/query at a later date)
|
|
|
|
// additional views (these may be merged in to pelias/query at a later date);
// loaded from the local ./view directory and referenced by name below
const views = {
  ngrams_strict:            require('./view/ngrams_strict'),
  ngrams_last_token_only:   require('./view/ngrams_last_token_only'),
  phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),
  pop_subquery:             require('./view/pop_subquery'),
  boost_exact_matches:      require('./view/boost_exact_matches')
};
|
|
|
|
|
|
|
|
//------------------------------
|
|
|
|
// autocomplete query
|
|
|
|
//------------------------------
|
|
|
|
//------------------------------
// autocomplete query
//------------------------------
const query = new peliasQuery.layout.FilteredBooleanQuery();

// mandatory matches
query.score( views.phrase_first_tokens_only, 'must' );
query.score( views.ngrams_last_token_only, 'must' );

// address components
[ 'housenumber', 'street', 'postcode' ].forEach( ( field ) => {
  query.score( peliasQuery.view.address( field ) );
});

// admin components
[
  'country', 'country_a',
  'region', 'region_a',
  'county', 'borough',
  'localadmin', 'locality',
  'neighbourhood'
].forEach( ( field ) => {
  query.score( peliasQuery.view.admin( field ) );
});

// scoring boost
query.score( views.boost_exact_matches );
query.score( peliasQuery.view.focus( views.ngrams_strict ) );
query.score( peliasQuery.view.popularity( views.pop_subquery ) );
query.score( peliasQuery.view.population( views.pop_subquery ) );

// non-scoring hard filters
query.filter( peliasQuery.view.sources );
query.filter( peliasQuery.view.layers );
query.filter( peliasQuery.view.boundary_rect );
query.filter( peliasQuery.view.boundary_country );
|
/* NOTE: a duplicated copy of the commit message describing the short-input
   focus.point hard distance filter appeared here (scrape artifact); the
   full text is preserved near the top of this file. */
|
|
|
// non-scoring hard filter: boundary circle (presumably reads the
// 'boundary:circle:*' variables set in generateQuery for short
// focus.point inputs — confirm against the pelias-query view)
query.filter( peliasQuery.view.boundary_circle );
|
|
|
|
|
|
|
|
// --------------------------------
|
|
|
|
|
|
|
|
/**
 * Map request variables to query variables for all inputs
 * provided by this HTTP request.
 *
 * @param {object} clean - sanitized request parameters
 * @returns {object} `{ type: 'autocomplete', body: <rendered ES query> }`
 */
function generateQuery( clean ){

  const vs = new peliasQuery.Vars( defaults );

  // sources
  if( check.array(clean.sources) && clean.sources.length ){
    vs.var( 'sources', clean.sources );
  }

  // layers
  if( check.array(clean.layers) && clean.layers.length ){
    vs.var( 'layers', clean.layers);
  }

  // boundary country
  if( check.string(clean['boundary.country']) ){
    vs.set({
      'boundary:country': clean['boundary.country']
    });
  }

  // pass the input tokens to the views so they can choose which tokens
  // are relevant for their specific function.
  if( check.array( clean.tokens ) ){
    vs.var( 'input:name:tokens', clean.tokens );
    vs.var( 'input:name:tokens_complete', clean.tokens_complete );
    vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete );
  }

  // input text
  vs.var( 'input:name', clean.text );

  // if the tokenizer has run then we set 'input:name' to the combination of
  // the 'complete' tokens with the 'incomplete' tokens; the resulting array
  // differs slightly from the 'input:name:tokens' array as some tokens might
  // have been removed in the process, such as single grams which are not
  // present in the ngrams index.
  if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){
    const combined = clean.tokens_complete.concat( clean.tokens_incomplete );
    if( combined.length ){
      vs.var( 'input:name', combined.join(' ') );
    }
  }

  // focus point
  if( check.number(clean['focus.point.lat']) &&
      check.number(clean['focus.point.lon']) ){
    vs.set({
      'focus:point:lat': clean['focus.point.lat'],
      'focus:point:lon': clean['focus.point.lon']
    });

    // search only near the focus.point for short inputs;
    // this reduces the number of documents hit and keeps latency low
    const hardLimitTextLength = config.get('api.autocomplete.focusHardLimitTextLength') || 0;
    const distanceMultiplier = config.get('api.autocomplete.focusHardLimitMultiplier') || 50;

    if (clean.text.length < hardLimitTextLength) {
      vs.set({
        'boundary:circle:lat': clean['focus.point.lat'],
        'boundary:circle:lon': clean['focus.point.lon'],
        // bugfix: the radius previously hard-coded 50km-per-character,
        // silently ignoring api.autocomplete.focusHardLimitMultiplier;
        // use the configured multiplier instead
        'boundary:circle:radius': `${distanceMultiplier * clean.text.length}km`
      });
    }
  }

  // boundary rect
  if( check.number(clean['boundary.rect.min_lat']) &&
      check.number(clean['boundary.rect.max_lat']) &&
      check.number(clean['boundary.rect.min_lon']) &&
      check.number(clean['boundary.rect.max_lon']) ){
    vs.set({
      'boundary:rect:top': clean['boundary.rect.max_lat'],
      'boundary:rect:right': clean['boundary.rect.max_lon'],
      'boundary:rect:bottom': clean['boundary.rect.min_lat'],
      'boundary:rect:left': clean['boundary.rect.min_lon']
    });
  }

  // run the address parser
  if( clean.parsed_text ){
    textParser( clean.parsed_text, vs );
  }

  return {
    type: 'autocomplete',
    body: query.render(vs)
  };
}
|
|
|
|
|
|
|
|
// export the query builder as this module's sole entry point
module.exports = generateQuery;
|