Browse Source

Merge pull request #1173 from pelias/configurable-boosts

Configurable boosts for sources and layers
pull/1224/head
Julian Simioni 6 years ago committed by GitHub
parent
commit
5a8977122b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 39
      README.md
  2. 3
      query/autocomplete.js
  3. 10
      query/autocomplete_defaults.js
  4. 4
      query/search.js
  5. 13
      query/search_defaults.js
  6. 3
      query/search_original.js
  7. 121
      query/view/boost_sources_and_layers.js
  8. 4
      schema.js
  9. 106
      test/unit/fixture/autocomplete_custom_boosts.json
  10. 108
      test/unit/fixture/search_with_custom_boosts.json
  11. 12
      test/unit/query/autocomplete.js
  12. 54
      test/unit/query/autocomplete_with_custom_boosts.js
  13. 12
      test/unit/query/search_original.js
  14. 53
      test/unit/query/search_with_custom_boosts.js
  15. 119
      test/unit/query/view/boost_sources_and_layers.js
  16. 7
      test/unit/run.js

39
README.md

@ -39,13 +39,14 @@ The API recognizes the following properties under the top-level `api` key in you
|parameter|required|default|description|
|---|---|---|---|
|`indexName`|*no*|*pelias*|name of the Elasticsearch index to be used when building queries|
|`relativeScores`|*no*|true|if set to true, confidence scores will be normalized, realistically at this point setting this to false is not tested or desirable
|`accessLog`|*no*||name of the format to use for access logs; may be any one of the [predefined values](https://github.com/expressjs/morgan#predefined-formats) in the `morgan` package. Defaults to `"common"`; if set to `false`, or an otherwise falsy value, disables access-logging entirely.|
|`services`|*no*||service definitions for [point-in-polygon](https://github.com/pelias/pip-service), [libpostal](https://github.com/whosonfirst/go-whosonfirst-libpostal), [placeholder](https://github.com/pelias/placeholder), and [interpolation](https://github.com/pelias/interpolation) services. If missing (which is not recommended), the services will not be called.|
|`defaultParameters.focus.point.lon` <br> `defaultParameters.focus.point.lat`|no | |default coordinates for focus point
|`targets.layers_by_source` <br> `targets.source_aliases` <br> `targets.layer_aliases`|no | |custom values for which `sources` and `layers` the API accepts ([more info](https://github.com/pelias/api/pull/1131)).
|`customBoosts` | no | `{}` | Allows configuring boosts for specific sources and layers, in order to influence result order. See [Configurable Boosts](#custom-boosts) below for details |
|`indexName`|*no*|*pelias*|name of the Elasticsearch index to be used when building queries|
|`attributionURL`|no| (autodetected)|The full URL to use for the attribution link returned in all Pelias responses. Pelias will attempt to autodetect this host, but it will often be incorrect if, for example, there is a proxy between Pelias and its users. This parameter allows setting a specific URL to avoid any such issues|
|`accessLog`|*no*||name of the format to use for access logs; may be any one of the [predefined values](https://github.com/expressjs/morgan#predefined-formats) in the `morgan` package. Defaults to `"common"`; if set to `false`, or an otherwise falsy value, disables access-logging entirely.|
|`relativeScores`|*no*|true|if set to true, confidence scores will be normalized, realistically at this point setting this to false is not tested or desirable
A good starting configuration file includes this section (fill in the service and Elasticsearch hosts as needed):
@ -82,6 +83,38 @@ A good starting configuration file includes this section (fill in the service an
The `timeout` and `retry` values, as shown in the `pip` service section, are optional but configurable for all services (see [pelias/microservice-wrapper](https://github.com/pelias/microservice-wrapper) for more details).
### Custom Boosts
The `customBoosts` config section allows influencing the sorting of results returned from most Pelias queries. Every Pelias record has a `source` and `layer` value, and this section allows prioritizing certain `sources` and `layers`.
First, keep in mind:
1. This will not affect _all_ Pelias queries. In particular, when using the `/v1/search` endpoint, queries for administrative areas (cities, countries, etc) will likely not be affected
2. Custom boosts allow _influencing_ results, but not completely controlling them. Very good matches that aren't in a boosted `source` or `layer` may still be returned first.
The basic form of the configuration looks like this:
```js
{
"api": {
"customBoosts": {
"layer": {
"layername": 5,
"layername2": 3
},
"source": {
"sourcename": 5
}
}
}
}
```
There are subsections for both `layer` and `source`, and each subsection must be an object. Keys in those objects represent the sources and layers to be boosted, and the value associated with those keys must be a numeric value.
Boost values are essentially multipliers, so values greater than `1` will cause a source or layer to be returned more often, and higher in results. Boosts of the value `1` are the same as no boost, and boosts between `0` and `1` will de-prioritize matching records.
Recommended boost values are between 1 and 5. Higher boosts are likely to cause unexpected impact without really improving results much.
## Configuration via Environment variable
Most Pelias configuration is done through pelias-config, however the API has additional environment variables that affect its operation:

3
query/autocomplete.js

@ -3,9 +3,11 @@ const defaults = require('./autocomplete_defaults');
const textParser = require('./text_parser_addressit');
const check = require('check-types');
const logger = require('pelias-logger').get('api');
const config = require('pelias-config').generate().api;
// additional views (these may be merged in to pelias/query at a later date)
var views = {
custom_boosts: require('./view/boost_sources_and_layers'),
ngrams_strict: require('./view/ngrams_strict'),
ngrams_last_token_only: require('./view/ngrams_last_token_only'),
phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),
@ -43,6 +45,7 @@ query.score( views.boost_exact_matches );
query.score( peliasQuery.view.focus( views.ngrams_strict ) );
query.score( peliasQuery.view.popularity( views.pop_subquery ) );
query.score( peliasQuery.view.population( views.pop_subquery ) );
query.score( views.custom_boosts( config.customBoosts ) );
// non-scoring hard filters
query.filter( peliasQuery.view.sources );

10
query/autocomplete_defaults.js

@ -91,6 +91,12 @@ module.exports = _.merge({}, peliasQuery.defaults, {
'population:field': 'population',
'population:modifier': 'log1p',
'population:max_boost': 20,
'population:weight': 3
'population:weight': 3,
// boost_sources_and_layers view
'custom:boosting:min_score': 1, // score applied to documents which don't score anything via functions
'custom:boosting:boost': 5, // multiply score by this number to increase the strength of the boost
'custom:boosting:max_boost': 50, // maximum boosting which can be applied (max_boost/boost = max_score)
'custom:boosting:score_mode': 'sum', // sum all function scores before multiplying the boost
'custom:boosting:boost_mode': 'multiply' // this mode is not relevant because there is no query section
});

4
query/search.js

@ -166,11 +166,11 @@ function isPostalCodeWithCountry(vs) {
var isSet = (layer) => {
return vs.isset(`input:${layer}`);
};
var allowedFields = ['postcode', 'country'];
var disallowedFields = ['query', 'category', 'housenumber', 'street', 'locality',
'neighbourhood', 'borough', 'county', 'region'];
return allowedFields.every(isSet) &&
!disallowedFields.some(isSet);
}

13
query/search_defaults.js

@ -93,7 +93,16 @@ module.exports = _.merge({}, peliasQuery.defaults, {
'population:max_boost': 20,
'population:weight': 2,
// used by fallback queries
// @todo: it is also possible to specify layer boosting
// via pelias/config, consider deprecating this config.
'boost:address': 10,
'boost:street': 5
'boost:street': 5,
// boost_sources_and_layers view
'custom:boosting:min_score': 1, // score applied to documents which don't score anything via functions
'custom:boosting:boost': 5, // multiply score by this number to increase the strength of the boost
'custom:boosting:max_boost': 50, // maximum boosting which can be applied (max_boost/boost = max_score)
'custom:boosting:score_mode': 'sum', // sum all function scores before multiplying the boost
'custom:boosting:boost_mode': 'multiply' // this mode is not relevant because there is no query section
});

3
query/search_original.js

@ -3,8 +3,10 @@ const defaults = require('./search_defaults');
const textParser = require('./text_parser_addressit');
const check = require('check-types');
const logger = require('pelias-logger').get('api');
const config = require('pelias-config').generate().api;
var placeTypes = require('../helper/placeTypes');
var views = { custom_boosts: require('./view/boost_sources_and_layers') };
// region_a is also an admin field. addressit tries to detect
// region_a, in which case we use a match query specifically for it.
@ -38,6 +40,7 @@ query.score( peliasQuery.view.address('postcode') );
query.score( peliasQuery.view.admin('country_a') );
query.score( peliasQuery.view.admin('region_a') );
query.score( peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin') );
query.score( views.custom_boosts( config.customBoosts ) );
// non-scoring hard filters
query.filter( peliasQuery.view.boundary_circle );

121
query/view/boost_sources_and_layers.js

@ -0,0 +1,121 @@
/**
This view allows users to specify a custom boost for sources and layers.
The view is implemented using a 'function_score' query, which enumerates multiple 'functions', each
function will assign a 'score' to each document when matched.
A document can match more than one function, in this case the 'score_mode' is used to decide how these
scores are combined, the default is 'sum'.
Likewise, a document can also match zero functions, in this case it is assigned a score of 'min_score'.
The computed score is then multiplied by the 'boost' value in order to come up with the final boost value
which will be assigned to that document. The 'boost' value is essentially a hard-coded multiplier for the score.
The 'max_boost' property is simply a ceiling for this computed boost, if the computed boost is higher than
max_boost it will be assigned the value of max_boost instead.
Note: This is a simple use of the 'function_score' query, as such we don't use the 'boost_mode' property
(because there is no query section) and the 'weight' values we assign are simply returned verbatim
(because we use filter queries for the function scoring).
ref: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
example config section:
{
"source": {
"openstreetmap": 5
},
"layer": {
"street": 3,
"country": 5
}
}
example query:
{
"function_score": {
"query": {
"match_all": {}
},
"functions": [{
"filter": {
"match": {
"layer": "intersections"
}
},
"weight": 1.6
},{
"filter": {
"match": {
"layer": "stops"
}
},
"weight": 2.4
}],
"boost": 5,
"max_boost": 40,
"score_mode": "sum",
"boost_mode": "multiply",
"min_score": 1
}
}
**/
// supported top-level config items
const TARGETS = ['source', 'layer'];
module.exports = function( config ) {
// no valid config to use, fail now, don't render this view.
if( !config ) { return function(){ return null; }; }
return function( vs ) {
// validate required params
if( !vs.isset('custom:boosting:min_score') ||
!vs.isset('custom:boosting:boost') ||
!vs.isset('custom:boosting:max_boost') ||
!vs.isset('custom:boosting:score_mode') ||
!vs.isset('custom:boosting:boost_mode') ){
return null;
}
// base 'function_score' view
var view = {
'function_score': {
'query': { 'match_all': {} }, // apply to all documents
'functions': [], // a list of functions which contribute to a 'score' for each document
'min_score': vs.var('custom:boosting:min_score'),
'boost': vs.var('custom:boosting:boost'),
'max_boost': vs.var('custom:boosting:max_boost'),
'score_mode': vs.var('custom:boosting:score_mode'),
'boost_mode': vs.var('custom:boosting:boost_mode')
},
};
// iterate over supported targets and their values
TARGETS.forEach( function( target ) {
if( 'object' === typeof config[target] ) {
Object.keys(config[target]).forEach(function(value) {
// add a scoring function for this target, assigning a weight
let weight = config[target][value];
view.function_score.functions.push({
'weight': isNaN(weight) ? 1 : weight,
'filter': {
'match': {
[target]: value
}
}
});
});
}
});
// no functions were generated, fail now, don't render this view.
if( view.function_score.functions.length === 0 ) { return null; }
return view;
};
};

4
schema.js

@ -20,6 +20,10 @@ module.exports = Joi.object().keys({
accessLog: Joi.string(),
relativeScores: Joi.boolean(),
requestRetries: Joi.number().integer().min(0),
customBoosts: Joi.object().keys({
layer: Joi.object(),
source: Joi.object()
}),
localization: Joi.object().keys({
flipNumberAndStreetCountries: Joi.array().items(Joi.string().regex(/^[A-Z]{3}$/))
}).unknown(false),

106
test/unit/fixture/autocomplete_custom_boosts.json

@ -0,0 +1,106 @@
{
"type": "autocomplete",
"body": {
"query": {
"bool": {
"must": [
{
"match": {
"name.default": {
"analyzer": "peliasQueryFullToken",
"type": "phrase",
"boost": 1,
"slop": 3,
"query": "foo"
}
}
}
],
"should": [
{
"match": {
"phrase.default": {
"analyzer": "peliasPhrase",
"type": "phrase",
"boost": 1,
"slop": 3,
"query": "foo"
}
}
},
{
"function_score": {
"query": {
"match_all": {}
},
"max_boost": 20,
"functions": [
{
"field_value_factor": {
"modifier": "log1p",
"field": "popularity",
"missing": 1
},
"weight": 1
}
],
"score_mode": "first",
"boost_mode": "replace"
}
},
{
"function_score": {
"query": {
"match_all": {}
},
"max_boost": 20,
"functions": [
{
"field_value_factor": {
"modifier": "log1p",
"field": "population",
"missing": 1
},
"weight": 3
}
],
"score_mode": "first",
"boost_mode": "replace"
}
},{
"function_score": {
"query": {
"match_all": {}
},
"min_score": 1,
"boost": 5,
"max_boost": 50,
"score_mode": "sum",
"boost_mode": "multiply",
"functions": [{
"filter": {
"match": {
"source": "openstreetmap"
}
},
"weight": 5
},{
"filter": {
"match": {
"layer": "transit"
}
},
"weight": 3
}]
}
}
]
}
},
"size": 20,
"track_scores": true,
"sort": [
"_score"
]
}
}

108
test/unit/fixture/search_with_custom_boosts.json

@ -0,0 +1,108 @@
{
"type": "search_original",
"body": {
"query": {
"bool": {
"must": [{
"match": {
"name.default": {
"query": "test",
"boost": 1,
"analyzer": "peliasQueryFullToken"
}
}
}],
"should": [{
"match": {
"phrase.default": {
"query": "test",
"analyzer": "peliasPhrase",
"type": "phrase",
"boost": 1,
"slop": 2
}
}
},{
"function_score": {
"query": {
"match": {
"phrase.default": {
"query": "test",
"analyzer": "peliasPhrase",
"type": "phrase",
"slop": 2,
"boost": 1
}
}
},
"max_boost": 20,
"score_mode": "first",
"boost_mode": "replace",
"functions": [{
"field_value_factor": {
"modifier": "log1p",
"field": "popularity",
"missing": 1
},
"weight": 1
}]
}
},{
"function_score": {
"query": {
"match": {
"phrase.default": {
"query": "test",
"analyzer": "peliasPhrase",
"type": "phrase",
"slop": 2,
"boost": 1
}
}
},
"max_boost": 20,
"score_mode": "first",
"boost_mode": "replace",
"functions": [{
"field_value_factor": {
"modifier": "log1p",
"field": "population",
"missing": 1
},
"weight": 2
}]
}
},{
"function_score": {
"query": {
"match_all": {}
},
"min_score": 1,
"boost": 5,
"max_boost": 50,
"score_mode": "sum",
"boost_mode": "multiply",
"functions": [{
"filter": {
"match": {
"source": "openstreetmap"
}
},
"weight": 5
},{
"filter": {
"match": {
"layer": "transit"
}
},
"weight": 3
}]
}
}]
}
},
"sort": [ "_score" ],
"size": 10,
"track_scores": true
}
}

12
test/unit/query/autocomplete.js

@ -1,4 +1,14 @@
var generate = require('../../../query/autocomplete');
const proxyquire = require('proxyquire').noCallThru();
const realPeliasConfig = require('pelias-config');
const defaultPeliasConfig = {
generate: function() {
return realPeliasConfig.defaults;
}
};
var generate = proxyquire('../../../query/autocomplete', {
'pelias-config': defaultPeliasConfig
});
module.exports.tests = {};

54
test/unit/query/autocomplete_with_custom_boosts.js

@ -0,0 +1,54 @@
const proxyquire = require('proxyquire');
module.exports.tests = {};
// Builds the autocomplete query with a stubbed pelias-config that supplies
// `api.customBoosts`, then compares the result against the JSON fixture.
module.exports.tests.query = (test, common) => {
  test('valid autocomplete with custom boosts', (t) => {
    // stand-in for pelias-config, exposing only the section under test
    const stubbedConfig = {
      generate: () => ({
        api: {
          customBoosts: {
            source: { openstreetmap: 5 },
            layer: { transit: 3 }
          }
        }
      })
    };

    // sanitized request handed to the query builder
    const input = {
      tokens: ['foo'],
      tokens_complete: ['foo'],
      tokens_incomplete: [],
      text: 'test',
      querySize: 10
    };

    const expected = require('../fixture/autocomplete_custom_boosts.json');
    const buildQuery = proxyquire('../../../query/autocomplete', {
      'pelias-config': stubbedConfig
    });

    // round-trip through JSON so the comparison is made on plain data
    const actual = JSON.parse( JSON.stringify( buildQuery(input) ) );

    t.deepEqual(actual, expected, 'query as expected');
    t.pass();
    t.end();
  });
};
// Runs every test group registered on `module.exports.tests`, prefixing each
// test name so the output identifies this suite.
module.exports.all = (tape, common) => {
  const test = (name, testFunction) =>
    tape(`autocomplete with custom boosts query ${name}`, testFunction);

  for( const testCase in module.exports.tests ){
    const register = module.exports.tests[testCase];
    register(test, common);
  }
};

12
test/unit/query/search_original.js

@ -1,4 +1,14 @@
var generate = require('../../../query/search_original');
const proxyquire = require('proxyquire').noCallThru();
const realPeliasConfig = require('pelias-config');
const defaultPeliasConfig = {
generate: function() {
return realPeliasConfig.defaults;
}
};
var generate = proxyquire('../../../query/search_original', {
'pelias-config': defaultPeliasConfig
});
module.exports.tests = {};

53
test/unit/query/search_with_custom_boosts.js

@ -0,0 +1,53 @@
const proxyquire = require('proxyquire');
module.exports.tests = {};
// Builds the search_original query with a stubbed pelias-config that supplies
// `api.customBoosts`, then compares the result against the JSON fixture.
module.exports.tests.query = (test, common) => {
  test('valid search with custom boosts', (t) => {
    // stand-in for pelias-config, exposing only the section under test
    const stubbedConfig = {
      generate: () => ({
        api: {
          customBoosts: {
            source: { openstreetmap: 5 },
            layer: { transit: 3 }
          }
        }
      })
    };

    // sanitized request handed to the query builder
    const input = {
      tokens: ['foo'],
      tokens_complete: ['foo'],
      tokens_incomplete: [],
      text: 'test',
      querySize: 10
    };

    const expected = require('../fixture/search_with_custom_boosts.json');
    const buildQuery = proxyquire('../../../query/search_original', {
      'pelias-config': stubbedConfig
    });

    // round-trip through JSON so the comparison is made on plain data
    const actual = JSON.parse( JSON.stringify( buildQuery(input) ) );

    t.deepEqual(actual, expected, 'query as expected');
    t.pass();
    t.end();
  });
};
// Runs every test group registered on `module.exports.tests`, prefixing each
// test name so the output identifies this suite.
module.exports.all = (tape, common) => {
  const test = (name, testFunction) =>
    tape(`search with custom boosts query ${name}`, testFunction);

  for( const testCase in module.exports.tests ){
    const register = module.exports.tests[testCase];
    register(test, common);
  }
};

119
test/unit/query/view/boost_sources_and_layers.js

@ -0,0 +1,119 @@
const query = require('pelias-query');
const vs = new query.Vars(require('../../../../query/search_defaults'));
const boost_sources_and_layers = require('../../../../query/view/boost_sources_and_layers');
module.exports.tests = {};
// Configurations that contain nothing to boost must render no query at all.
module.exports.tests.empty_config = (test, common) => {
  test('empty configuration returns empty query', (t) => {
    const render = boost_sources_and_layers({});
    t.equal(render(vs), null, 'query is empty');
    t.end();
  });

  test('undefined configuration returns empty query', (t) => {
    const render = boost_sources_and_layers(undefined);
    t.equal(render(vs), null, 'query is empty');
    t.end();
  });
};
// A config with one layer boost must yield exactly one scoring function.
module.exports.tests.single_item_config = (test, common) => {
  test('config with single layer entry produces a single scoring function with weight', (t) => {
    const boosts = { layer: { locality: 5 } };

    // the only scoring function expected: layer 'locality' weighted 5
    const localityFunction = {
      'filter': { 'match': { 'layer': 'locality' } },
      'weight': 5
    };

    const expected = {
      'function_score': {
        'query': { 'match_all': {} },
        'functions': [ localityFunction ],
        'boost': vs.var('custom:boosting:boost'),
        'max_boost': vs.var('custom:boosting:max_boost'),
        'score_mode': vs.var('custom:boosting:score_mode'),
        'boost_mode': vs.var('custom:boosting:boost_mode'),
        'min_score': vs.var('custom:boosting:min_score')
      }
    };

    const render = boost_sources_and_layers(boosts);
    t.deepEquals(render(vs), expected, 'query contains a single scoring function');
    t.end();
  });
};
// A config with several source/layer boosts must yield one scoring function per
// entry, sources first and then layers, each in config order.
module.exports.tests.mulitple_item_config = (test, common) => {
  test('config with multiple items produces multiple scoring functions', (t) => {
    const boosts = {
      source: { whosonfirst: 6 },
      layer: { country: 2, borough: 0.5 }
    };

    // shape of a single scoring function inside the 'function_score' query
    const scoringFunction = (field, value, weight) => ({
      'filter': { 'match': { [field]: value } },
      'weight': weight
    });

    const expected = {
      'function_score': {
        'query': { 'match_all': {} },
        'functions': [
          scoringFunction('source', 'whosonfirst', 6),
          scoringFunction('layer', 'country', 2),
          scoringFunction('layer', 'borough', 0.5)
        ],
        'boost': vs.var('custom:boosting:boost'),
        'max_boost': vs.var('custom:boosting:max_boost'),
        'score_mode': vs.var('custom:boosting:score_mode'),
        'boost_mode': vs.var('custom:boosting:boost_mode'),
        'min_score': vs.var('custom:boosting:min_score')
      }
    };

    const render = boost_sources_and_layers(boosts);
    t.deepEquals(render(vs), expected, 'query contains multiple scoring functions');
    t.end();
  });
};
// Runs every test group registered on `module.exports.tests`, prefixing each
// test name so the output identifies this suite.
module.exports.all = (tape, common) => {
  const test = (name, testFunction) =>
    tape(`boost sources and layers ${name}`, testFunction);

  for( const testCase in module.exports.tests ){
    const register = module.exports.tests[testCase];
    register(test, common);
  }
};

7
test/unit/run.js

@ -64,13 +64,16 @@ var tests = [
require('./query/address_search_using_ids'),
require('./query/autocomplete'),
require('./query/autocomplete_defaults'),
require('./query/search_defaults'),
require('./query/reverse_defaults'),
require('./query/autocomplete_with_custom_boosts'),
require('./query/reverse'),
require('./query/reverse_defaults'),
require('./query/search'),
require('./query/search_with_custom_boosts'),
require('./query/search_defaults'),
require('./query/search_original'),
require('./query/structured_geocoding'),
require('./query/text_parser'),
require('./query/view/boost_sources_and_layers'),
require('./sanitizer/_boundary_country'),
require('./sanitizer/_debug'),
require('./sanitizer/_flag_bool'),

Loading…
Cancel
Save