Browse Source

Merge pull request #687 from pelias/smart-dedupe

Rock-Paper-Scissors: check which record is preferred when dupe is found
pull/696/head v3.4.0
Diana Shkolnikov 8 years ago committed by GitHub
parent
commit
bcd25ceba4
  1. 49
      middleware/dedupe.js
  2. 130
      test/unit/middleware/dedupe.js

49
middleware/dedupe.js

@ -16,11 +16,41 @@ function dedupeResults(req, res, next) {
var uniqueResults = []; var uniqueResults = [];
_.some(res.data, function (hit) { _.some(res.data, function (hit) {
if (uniqueResults.length === 0 || _.every(uniqueResults, isDifferent.bind(null, hit)) ) {
if (_.isEmpty(uniqueResults)) {
uniqueResults.push(hit); uniqueResults.push(hit);
} }
else { else {
logger.info('[dupe]', { query: req.clean.text, hit: hit.name.default + ' ' + hit.source + ':' + hit._id }); // if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) {
return !isDifferent(elem, hit);
});
// if a dupe is not found, just add to results and move on
if (dupeIndex === -1) {
uniqueResults.push(hit);
}
// if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if (isPreferred(uniqueResults[dupeIndex], hit)) {
logger.info('[dupe][replacing]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: hit.name.default + ' ' + hit.source + ':' + hit._id
});
// replace previous dupe item with current hit
uniqueResults[dupeIndex] = hit;
}
// if not preferred over existing, just log and move on
else {
logger.info('[dupe][skipping]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: hit.name.default + ' ' + hit.source + ':' + hit._id
});
}
} }
// stop looping when requested size has been reached in uniqueResults // stop looping when requested size has been reached in uniqueResults
@ -32,4 +62,19 @@ function dedupeResults(req, res, next) {
next(); next();
} }
function isPreferred(existing, candidateReplacement) {
// NOTE: we are assuming here that the layer for both records is the same
//bind the trumps function to the data items to keep the rest of the function clean
var trumpsFunc = trumps.bind(null, existing, candidateReplacement);
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time
}
function trumps(existing, candidateReplacement, loserSource, winnerSource) {
return existing.source === loserSource && candidateReplacement.source === winnerSource;
}
module.exports = setup; module.exports = setup;

130
test/unit/middleware/dedupe.js

@ -58,6 +58,136 @@ module.exports.tests.dedupe = function(test, common) {
}); });
}; };
module.exports.tests.trump = function(test, common) {
test('whosonfirst trumps geonames, replace', function (t) {
var req = {
clean: {
text: 'Lancaster',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster' },
'source': 'geonames',
'source_id': '123456',
'layer': 'locality'
},
{
'name': { 'default': 'Lancaster' },
'source': 'whosonfirst',
'source_id': '654321',
'layer': 'locality'
}
]
};
var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won');
t.end();
});
});
test('whosonfirst trumps geonames, no replace', function (t) {
var req = {
clean: {
text: 'Lancaster',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster' },
'source': 'whosonfirst',
'source_id': '123456',
'layer': 'locality'
},
{
'name': { 'default': 'Lancaster' },
'source': 'geonames',
'source_id': '654321',
'layer': 'locality'
}
]
};
var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won');
t.end();
});
});
test('openstreetmap trumps whosonfirst venues', function (t) {
var req = {
clean: {
text: 'Lancaster Dairy Farm',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster Dairy Farm' },
'source': 'openstreetmap',
'source_id': '123456',
'layer': 'venue'
},
{
'name': { 'default': 'Lancaster Dairy Farm' },
'source': 'whosonfirst',
'source_id': '654321',
'layer': 'venue'
}
]
};
var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'openstreetmap', 'openstreetmap result won');
t.end();
});
});
test('openaddresses trumps openstreetmap', function (t) {
var req = {
clean: {
text: '100 Main St',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': '100 Main St' },
'source': 'openstreetmap',
'source_id': '123456',
'layer': 'address'
},
{
'name': { 'default': '100 Main St' },
'source': 'openaddresses',
'source_id': '654321',
'layer': 'address'
}
]
};
var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'openaddresses', 'openaddresses result won');
t.end();
});
});
};
module.exports.all = function (tape, common) { module.exports.all = function (tape, common) {
function test(name, testFunction) { function test(name, testFunction) {

Loading…
Cancel
Save