|
|
|
var logger = require('pelias-logger').get('api');
|
|
|
|
var _ = require('lodash');
|
|
|
|
var isDifferent = require('../helper/diffPlaces').isDifferent;
|
|
|
|
|
|
|
|
/**
 * Middleware factory.
 *
 * Takes no options; it simply hands back the `dedupeResults` handler so the
 * module can be wired up as `require(...)()`.
 *
 * @returns {function} the `dedupeResults(req, res, next)` middleware
 */
function setup() {
  var middleware = dedupeResults;
  return middleware;
}
|
|
|
|
|
|
|
|
/**
 * Middleware (req, res, next) that removes duplicate records from `res.data`.
 *
 * Hits are visited in order. A hit counts as a duplicate of an already accepted
 * record when `isDifferent(accepted, hit)` is falsy. For a duplicate pair,
 * `isPreferred` decides which record is kept; the survivor always occupies the
 * position of the first occurrence. Iteration stops as soon as
 * `req.clean.size` unique records have been collected.
 *
 * @param {object} req - request; nothing is done unless `req.clean` is defined.
 *   `req.clean.size` caps the number of unique results and `req.clean.text`
 *   is only used for logging.
 * @param {object} res - response; `res.data` is replaced by the de-duplicated list
 * @param {function} next - continuation callback, invoked exactly once
 * @returns {*} whatever `next()` returns on the early-exit path, otherwise undefined
 */
function dedupeResults(req, res, next) {
  // do nothing if no result data set
  if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) {
    return next();
  }

  // payload shared by the '[dupe][replacing]' and '[dupe][skipping]' log lines
  function dupeInfo(previous, hit) {
    return {
      query: req.clean.text,
      previous: previous.source,
      // _.get() rather than hit.name.default: a hit without a `name` object
      // must not abort the request just because it is being logged
      hit: _.get(hit, 'name.default') + ' ' + hit.source + ':' + hit._id
    };
  }

  // only unique items are copied into uniqueResults
  var uniqueResults = [];

  // _.some() is used purely for its early exit: returning true stops the loop
  _.some(res.data, function (hit) {
    // index of the already accepted record that `hit` duplicates, or -1;
    // on an empty uniqueResults this is -1, so the first hit is always accepted
    var dupeIndex = uniqueResults.findIndex(function (elem) {
      return !isDifferent(elem, hit);
    });

    if (dupeIndex === -1) {
      // not a dupe, just add to results and move on
      uniqueResults.push(hit);
    }
    else if (isPreferred(uniqueResults[dupeIndex], hit)) {
      // the order in which Elasticsearch returns identical text matches is
      // arbitrary, so a later hit may be the better record: replace in place
      logger.info('[dupe][replacing]', dupeInfo(uniqueResults[dupeIndex], hit));
      uniqueResults[dupeIndex] = hit;
    }
    else {
      // not preferred over existing, just log and move on
      logger.info('[dupe][skipping]', dupeInfo(uniqueResults[dupeIndex], hit));
    }

    // stop looping when requested size has been reached in uniqueResults
    return req.clean.size <= uniqueResults.length;
  });

  res.data = uniqueResults;

  next();
}
|
|
|
|
|
|
|
|
/**
 * Decides whether a newly seen duplicate should replace the record already kept.
 *
 * NOTE: both records are assumed to belong to the same layer.
 *
 * @param {object} existing - record currently held in the unique results
 * @param {object} candidateReplacement - duplicate record encountered later
 * @returns {boolean} true when `candidateReplacement` should take the place of `existing`
 */
function isPreferred(existing, candidateReplacement) {
  // ordered [loserSource, winnerSource] pairs, evaluated first to last
  var precedence = [
    ['geonames', 'whosonfirst'],        // WOF has bbox and is generally preferred
    ['geonamesmil', 'whosonfirst'],     // WOF has bbox and is generally preferred
    ['openstreetmap', 'openaddresses'], // addresses are better in OA
    ['whosonfirst', 'openstreetmap']    // venues are better in OSM, at this time
  ];

  // true when the record's source is openaddresses
  function isOA(record) {
    return _.get(record, 'source') === 'openaddresses';
  }

  // true when the record carries an `address_parts.zip` property
  function hasZip(record) {
    return _.has(record, 'address_parts.zip');
  }

  // https://github.com/pelias/api/issues/872
  // between two openaddresses records, only a zip on the candidate that the
  // existing record lacks justifies a replacement
  if (isOA(existing) && isOA(candidateReplacement)) {
    return hasZip(candidateReplacement) && !hasZip(existing);
  }

  return precedence.some(function (pair) {
    return trumps(existing, candidateReplacement, pair[0], pair[1]);
  });
}
|
|
|
|
|
|
|
|
/**
 * Tests one precedence rule: does the candidate, coming from `winnerSource`,
 * beat an existing record coming from `loserSource`?
 *
 * @param {object} existing - record currently kept
 * @param {object} candidateReplacement - record that might replace it
 * @param {string} loserSource - source the existing record must have
 * @param {string} winnerSource - source the candidate must have
 * @returns {boolean} true only when both sources match the rule
 */
function trumps(existing, candidateReplacement, loserSource, winnerSource) {
  if (existing.source !== loserSource) {
    return false;
  }

  return candidateReplacement.source === winnerSource;
}
|
|
|
|
|
|
|
|
module.exports = setup;
|