Browse Source

feat(dedupe): treat all non-canonical layers and analogous to a venue, prefer non-canonical records

pull/1222/head
Peter Johnson 6 years ago committed by Julian Simioni
parent
commit
c0a0663e21
No known key found for this signature in database
GPG Key ID: B9EEB0C6EE0910A1
  1. 19
      helper/TypeMapping.js
  2. 11
      helper/diffPlaces.js
  3. 5
      middleware/dedupe.js
  4. 32
      test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js
  5. 34
      test/unit/middleware/dedupe.js

19
helper/TypeMapping.js

@ -1,6 +1,9 @@
const _ = require('lodash'); const _ = require('lodash');
const elasticsearch = require('elasticsearch'); const elasticsearch = require('elasticsearch');
// a list of the canonical sources included in the default Pelias configuration
const CANONICAL_SOURCES = ['whosonfirst', 'openstreetmap', 'openaddresses', 'geonames'];
var TypeMapping = function(){ var TypeMapping = function(){
// A list of all sources // A list of all sources
@ -75,6 +78,22 @@ TypeMapping.prototype.generateMappings = function(){
this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases);
}; };
// return a list of all sources which are part of the canonical Pelias configuration
TypeMapping.prototype.getCanonicalSources = function(){
return CANONICAL_SOURCES;
};
// generate a list of all layers which are part of the canonical Pelias configuration
TypeMapping.prototype.getCanonicalLayers = function(){
var canonicalLayers = [];
for( var source in this.layers_by_source ){
if( _.includes( CANONICAL_SOURCES, source ) ){
canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) );
}
}
return canonicalLayers;
};
// load values from targets block // load values from targets block
TypeMapping.prototype.loadTargets = function( targetsBlock ){ TypeMapping.prototype.loadTargets = function( targetsBlock ){

11
helper/diffPlaces.js

@ -1,5 +1,6 @@
const _ = require('lodash'); const _ = require('lodash');
const placeTypes = require('./placeTypes'); const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue'); const field = require('../helper/fieldValue');
/** /**
@ -7,7 +8,15 @@ const field = require('../helper/fieldValue');
* Returns false if the objects are the same, else true. * Returns false if the objects are the same, else true.
*/ */
function isLayerDifferent(item1, item2){ function isLayerDifferent(item1, item2){
return isPropertyDifferent(item1, item2, 'layer'); if( isPropertyDifferent(item1, item2, 'layer') ){
// consider all custom layers to be analogous to a venue
if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) &&
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){
return false;
}
return true;
}
return false;
} }
/** /**

5
middleware/dedupe.js

@ -1,6 +1,7 @@
const logger = require('pelias-logger').get('api'); const logger = require('pelias-logger').get('api');
const _ = require('lodash'); const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent; const isDifferent = require('../helper/diffPlaces').isDifferent;
const canonicalSources = require('../helper/type_mapping').getCanonicalSources();
const field = require('../helper/fieldValue'); const field = require('../helper/fieldValue');
function dedupeResults(req, res, next) { function dedupeResults(req, res, next) {
@ -74,6 +75,10 @@ function isPreferred(existingHit, candidateHit) {
if( !_.has(existingHit, 'address_parts.zip') && if( !_.has(existingHit, 'address_parts.zip') &&
_.has(candidateHit, 'address_parts.zip') ){ return true; } _.has(candidateHit, 'address_parts.zip') ){ return true; }
// prefer non-canonical sources over canonical ones
if( !_.includes(canonicalSources, candidateHit.source) &&
_.includes(canonicalSources, existingHit.source) ){ return true; }
// prefer certain sources over others // prefer certain sources over others
switch( existingHit.source ){ switch( existingHit.source ){
// sources are the same // sources are the same

32
test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js

@ -0,0 +1,32 @@
module.exports = [
{
'_id': '101914069',
'layer': 'venue',
'source': 'openstreetmap',
'name': {
'default': 'Nike World Headquarters'
},
'parent': {
'country_a': ['USA'],
'country': ['United States'],
'region': ['Oregon'],
'region_id': ['85688513']
},
'confidence': 0.98
},
{
'_id': '2456::trimet::major_employer',
'layer': 'major_employer',
'source': 'transit',
'name': {
'default': 'Nike World Headquarters'
},
'parent': {
'country_a': ['USA'],
'country': ['United States'],
'region': ['Oregon'],
'region_id': ['85688513']
},
'confidence': 0.50
}
];

34
test/unit/middleware/dedupe.js

@ -1,5 +1,6 @@
var data = require('../fixture/dedupe_elasticsearch_results'); var data = require('../fixture/dedupe_elasticsearch_results');
var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results');
var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results');
var dedupe = require('../../../middleware/dedupe')(); var dedupe = require('../../../middleware/dedupe')();
module.exports.tests = {}; module.exports.tests = {};
@ -56,10 +57,29 @@ module.exports.tests.dedupe = function(test, common) {
t.end(); t.end();
}); });
}); });
test('deduplicate between custom layers and venue layers', function(t) {
var req = {
clean: {
size: 20
}
}; };
var res = {
data: customLayerData
};
var expected = customLayerData[1]; // non-canonical record
dedupe(req, res, function () {
t.equal(res.data.length, 1, 'only one result displayed');
t.equal(res.data[0], expected, 'non-canonical data is preferred');
t.end();
});
});
};
module.exports.tests.trump = function(test, common) { module.exports.tests.priority = function(test, common) {
test('whosonfirst trumps geonames, replace', function (t) { test('whosonfirst takes priority over geonames, replace', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster', text: 'Lancaster',
@ -91,7 +111,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('whosonfirst trumps geonames, no replace', function (t) { test('whosonfirst takes priority over geonames, no replace', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster', text: 'Lancaster',
@ -123,7 +143,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openstreetmap trumps whosonfirst venues', function (t) { test('openstreetmap takes priority over whosonfirst venues', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster Dairy Farm', text: 'Lancaster Dairy Farm',
@ -155,7 +175,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openaddresses trumps openstreetmap', function (t) { test('openaddresses takes priority over openstreetmap', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',
@ -187,7 +207,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openaddresses with zip trumps openaddresses without zip', function (t) { test('openaddresses with zip takes priority over openaddresses without zip', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',
@ -223,7 +243,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('osm with zip trumps openaddresses without zip', function (t) { test('osm with zip takes priority over openaddresses without zip', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',

Loading…
Cancel
Save