Browse Source

feat(inter_layer_deduping): dedupe between layers and prefer some layers over others

inter_layer_deduping
Peter Johnson 6 years ago
parent
commit
15a3d4f489
  1. 4
      helper/diffPlaces.js
  2. 35
      middleware/dedupe.js
  3. 140
      test/unit/middleware/dedupe.js

4
helper/diffPlaces.js

@ -14,6 +14,10 @@ function isLayerDifferent(item1, item2){
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){
return false;
}
// consider some layers to be synonymous
if( _.includes( placeTypes, item1.layer ) && _.includes( placeTypes, item2.layer ) ){
return false;
}
return true;
}
return false;

35
middleware/dedupe.js

@ -4,6 +4,10 @@ const isDifferent = require('../helper/diffPlaces').isDifferent;
const canonical_sources = require('../helper/type_mapping').canonical_sources;
const field = require('../helper/fieldValue');
// when performing inter-layer deduping, layers appearing earlier in this list
// take precedence over those appearing later or not at all.
const layerPreferences = [ 'locality', 'country', 'localadmin', 'region', 'neighbourhood' ];
function dedupeResults(req, res, next) {
// do nothing if request data is invalid
@ -79,19 +83,28 @@ function isPreferred(existingHit, candidateHit) {
if( !_.includes(canonical_sources, candidateHit.source) &&
_.includes(canonical_sources, existingHit.source) ){ return true; }
// prefer certain layers over others
if( existingHit.layer !== candidateHit.layer && _.isArray( layerPreferences ) ){
for( let i=0; i<layerPreferences.length; i++ ){
if( existingHit.layer === layerPreferences[i] ){ return false; }
if( candidateHit.layer === layerPreferences[i] ){ return true; }
}
}
// prefer certain sources over others
switch( existingHit.source ){
// sources are the same
case candidateHit.source: return false;
// WOF has bbox and is generally preferred
case 'geonames': return candidateHit.source === 'whosonfirst';
// addresses are generally better in OA
case 'openstreetmap': return candidateHit.source === 'openaddresses';
// venues are better in OSM
case 'whosonfirst': return candidateHit.source === 'openstreetmap';
// no preference, keep existing hit
default: return false;
if( existingHit.source !== candidateHit.source ){
switch( existingHit.source ){
// WOF has bbox and is generally preferred
case 'geonames': return candidateHit.source === 'whosonfirst';
// addresses are generally better in OA
case 'openstreetmap': return candidateHit.source === 'openaddresses';
// venues are better in OSM
case 'whosonfirst': return candidateHit.source === 'openstreetmap';
}
}
// no preference, keep existing hit
return false;
}
module.exports = function() {

140
test/unit/middleware/dedupe.js

@ -327,6 +327,146 @@ module.exports.tests.priority = function(test, common) {
t.equal(res.data.length, 1, 'results have fewer items than before');
t.end();
});
// Two whosonfirst records share the name 'Singapore' but differ in layer
// (country vs locality); only the locality record is expected to remain.
test('locality takes priority over country, replace', function (t) {
  // builds a fresh whosonfirst record named 'Singapore'
  const makePlace = function (sourceId, layer) {
    return {
      'name': { 'default': 'Singapore' },
      'source': 'whosonfirst',
      'source_id': sourceId,
      'layer': layer
    };
  };

  const req = { clean: { text: 'Singapore', size: 100 } };
  const res = {
    data: [
      makePlace('123456', 'country'),
      makePlace('654321', 'locality')
    ]
  };

  dedupe(req, res, function () {
    // the two inputs must collapse into a single result
    t.equal(res.data.length, 1, 'results have fewer items than before');
    t.deepEqual(res.data[0].layer, 'locality', 'locality result won');
    t.end();
  });
});
// Two whosonfirst records share the name 'Auckland' but differ in layer
// (county vs locality); only the locality record is expected to remain.
test('locality takes priority over county, replace', function (t) {
  // builds a fresh whosonfirst record named 'Auckland'
  const makePlace = function (sourceId, layer) {
    return {
      'name': { 'default': 'Auckland' },
      'source': 'whosonfirst',
      'source_id': sourceId,
      'layer': layer
    };
  };

  const req = { clean: { text: 'Auckland', size: 100 } };
  const res = {
    data: [
      makePlace('123456', 'county'),
      makePlace('654321', 'locality')
    ]
  };

  dedupe(req, res, function () {
    // the two inputs must collapse into a single result
    t.equal(res.data.length, 1, 'results have fewer items than before');
    t.deepEqual(res.data[0].layer, 'locality', 'locality result won');
    t.end();
  });
});
// Two whosonfirst records share the name 'Bern' but differ in layer
// (region vs localadmin); only the localadmin record is expected to remain.
test('localadmin takes priority over region, replace', function (t) {
  // builds a fresh whosonfirst record named 'Bern'
  const makePlace = function (sourceId, layer) {
    return {
      'name': { 'default': 'Bern' },
      'source': 'whosonfirst',
      'source_id': sourceId,
      'layer': layer
    };
  };

  const req = { clean: { text: 'Bern', size: 100 } };
  const res = {
    data: [
      makePlace('123456', 'region'),
      makePlace('654321', 'localadmin')
    ]
  };

  dedupe(req, res, function () {
    // the two inputs must collapse into a single result
    t.equal(res.data.length, 1, 'results have fewer items than before');
    t.deepEqual(res.data[0].layer, 'localadmin', 'localadmin result won');
    t.end();
  });
});
// Four whosonfirst records share the name 'Parramatta' across the county,
// neighbourhood, localadmin and locality layers; the locality record is
// listed last and is expected to be the only one remaining.
test('locality takes priority over county, neighbourhood and localadmin, replace', function (t) {
  // builds a fresh whosonfirst record named 'Parramatta'
  const makePlace = function (sourceId, layer) {
    return {
      'name': { 'default': 'Parramatta' },
      'source': 'whosonfirst',
      'source_id': sourceId,
      'layer': layer
    };
  };

  const req = { clean: { text: 'Parramatta', size: 100 } };
  const res = {
    data: [
      makePlace('123456', 'county'),
      makePlace('7890', 'neighbourhood'),
      makePlace('0987', 'localadmin'),
      makePlace('654321', 'locality')
    ]
  };

  dedupe(req, res, function () {
    // all four inputs must collapse into a single result
    t.equal(res.data.length, 1, 'results have fewer items than before');
    t.deepEqual(res.data[0].layer, 'locality', 'locality result won');
    t.end();
  });
});
};
module.exports.all = function (tape, common) {

Loading…
Cancel
Save