From f1c5e8d9a8c52ff944180de3fee7bfc2aedba22b Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 29 Oct 2018 19:39:36 +0100 Subject: [PATCH] feat(dedupe): treat all non-canonical layers and analogous to a venue, prefer non-canonical records --- helper/TypeMapping.js | 19 +++++++++++ helper/diffPlaces.js | 11 +++++- middleware/dedupe.js | 5 +++ ...dupe_elasticsearch_custom_layer_results.js | 32 +++++++++++++++++ test/unit/middleware/dedupe.js | 34 +++++++++++++++---- 5 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js diff --git a/helper/TypeMapping.js b/helper/TypeMapping.js index 97d87a26..7f277c3e 100644 --- a/helper/TypeMapping.js +++ b/helper/TypeMapping.js @@ -1,6 +1,9 @@ const _ = require('lodash'); const elasticsearch = require('elasticsearch'); +// a list of the canonical sources included in the default Pelias configuration +const CANONICAL_SOURCES = ['whosonfirst', 'openstreetmap', 'openaddresses', 'geonames']; + var TypeMapping = function(){ // A list of all sources @@ -75,6 +78,22 @@ TypeMapping.prototype.generateMappings = function(){ this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); }; +// return a list of all sources which are part of the canonical Pelias configuration +TypeMapping.prototype.getCanonicalSources = function(){ + return CANONICAL_SOURCES; +}; + +// generate a list of all layers which are part of the canonical Pelias configuration +TypeMapping.prototype.getCanonicalLayers = function(){ + var canonicalLayers = []; + for( var source in this.layers_by_source ){ + if( _.includes( CANONICAL_SOURCES, source ) ){ + canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) ); + } + } + return canonicalLayers; +}; + // load values from targets block TypeMapping.prototype.loadTargets = function( targetsBlock ){ diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index ea15bd79..0f327bfb 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -1,5 +1,6 @@ const _ = require('lodash'); const placeTypes = require('./placeTypes'); +const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); const field = require('../helper/fieldValue'); /** @@ -7,7 +8,15 @@ const field = require('../helper/fieldValue'); * Returns false if the objects are the same, else true. */ function isLayerDifferent(item1, item2){ - return isPropertyDifferent(item1, item2, 'layer'); + if( isPropertyDifferent(item1, item2, 'layer') ){ + // consider all custom layers to be analogous to a venue + if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) && + ( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){ + return false; + } + return true; + } + return false; } /** diff --git a/middleware/dedupe.js b/middleware/dedupe.js index ba9e70e7..175c5887 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -1,6 +1,7 @@ const logger = require('pelias-logger').get('api'); const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; +const canonicalSources = require('../helper/type_mapping').getCanonicalSources(); const field = require('../helper/fieldValue'); function dedupeResults(req, res, next) { @@ -74,6 +75,10 @@ function isPreferred(existingHit, candidateHit) { if( !_.has(existingHit, 'address_parts.zip') && _.has(candidateHit, 'address_parts.zip') ){ return true; } + // prefer non-canonical sources over canonical ones + if( !_.includes(canonicalSources, candidateHit.source) && + _.includes(canonicalSources, existingHit.source) ){ return true; } + // prefer certain sources over others switch( existingHit.source ){ // sources are the same diff --git a/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js new file mode 100644 index 00000000..4c1f5f58 --- /dev/null +++ b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js @@ -0,0 +1,32 @@ +module.exports = [ + { + '_id': '101914069', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.98 + }, + { + '_id': '2456::trimet::major_employer', + 'layer': 'major_employer', + 'source': 'transit', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.50 + } +]; \ No newline at end of file diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 9b4d04c3..674a2b10 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -1,5 +1,6 @@ var data = require('../fixture/dedupe_elasticsearch_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); +var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results'); var dedupe = require('../../../middleware/dedupe')(); module.exports.tests = {}; @@ -56,10 +57,29 @@ module.exports.tests.dedupe = function(test, common) { t.end(); }); }); + + test('deduplicate between custom layers and venue layers', function(t) { + var req = { + clean: { + size: 20 + } + }; + var res = { + data: customLayerData + }; + var expected = customLayerData[1]; // non-canonical record + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'only one result displayed'); + t.equal(res.data[0], expected, 'non-canonical data is preferred'); + t.end(); + }); + }); }; -module.exports.tests.trump = function(test, common) { - test('whosonfirst trumps geonames, replace', function (t) { + +module.exports.tests.priority = function(test, common) { + test('whosonfirst takes priority over geonames, replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -91,7 +111,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('whosonfirst trumps geonames, no replace', function (t) { + test('whosonfirst takes priority over geonames, no replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -123,7 +143,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openstreetmap trumps whosonfirst venues', function (t) { + test('openstreetmap takes priority over whosonfirst venues', function (t) { var req = { clean: { text: 'Lancaster Dairy Farm', @@ -155,7 +175,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses trumps openstreetmap', function (t) { + test('openaddresses takes priority over openstreetmap', function (t) { var req = { clean: { text: '100 Main St', @@ -187,7 +207,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses with zip trumps openaddresses without zip', function (t) { + test('openaddresses with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St', @@ -223,7 +243,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('osm with zip trumps openaddresses without zip', function (t) { + test('osm with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St',