You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

99 lines
3.6 KiB

const logger = require('pelias-logger').get('api');
const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent;
const canonicalSources = require('../helper/type_mapping').getCanonicalSources();
const field = require('../helper/fieldValue');
/**
 * Middleware which removes duplicate records from `res.data`.
 *
 * Records are compared pairwise with `isDifferent`; when two records are not
 * different, `isPreferred` decides which one of them is kept. The first
 * record in `res.data` is always retained as the initial unique entry (it can
 * still be swapped for a preferred duplicate found later on).
 *
 * Iteration stops as soon as the number of unique records reaches
 * `req.clean.size`. `res.data` is then replaced with the unique records.
 *
 * @param {object} req - request; `req.clean` must be a plain object, its
 *   `text` is used for logging and its `size` to stop iterating early
 * @param {object} res - response; `res.data` must be a non-empty array
 * @param {function} next - callback invoked exactly once when done
 * @returns {*} whatever `next()` returns on the early-exit paths, else undefined
 */
function dedupeResults(req, res, next) {

  // do nothing if request data is invalid
  // note: `req` itself is checked so that `req.clean` can never throw
  if( _.isUndefined(req) || !_.isPlainObject(req.clean) ){ return next(); }

  // do nothing if result data is invalid or empty
  if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }

  // loop through data items and only copy unique items to unique
  // note: the first result must always be unique!
  const unique = [ res.data[0] ];

  // convenience function to search unique array for an existing element which matches a hit
  const findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));

  // convenience function to produce the log representation of a hit
  const describeHit = (hit) =>
    field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id;

  // iterate over res.data using an old-school for loop starting at index 1
  // we can call break at any time to end the iterator
  for( let i=1; i<res.data.length; i++){
    const hit = res.data[i];

    // search the unique hits for a dupe of the current hit
    // save off the index of the dupe if found
    const dupeIndex = findMatch(hit);

    // if a dupe is not found, just add to list of unique hits and continue
    if( dupeIndex === -1 ){ unique.push(hit); }

    // if dupe was found, we need to check which of the records is preferred
    // since the order in which Elasticsearch returns identical text matches is arbitrary
    // of course, if the new one is preferred we should replace previous with new
    else if( isPreferred( unique[dupeIndex], hit ) ) {

      // keep a reference to the record being replaced, it is required
      // for logging after the slot has been overwritten
      const previous = unique[dupeIndex];

      // replace previous dupe item with current hit
      unique[dupeIndex] = hit;

      // logging
      logger.debug('[dupe][replacing]', {
        query: req.clean.text,
        previous: previous.source,
        hit: describeHit(hit)
      });
    }

    // if not preferred over existing, just log and continue
    else {
      logger.debug('[dupe][skipping]', {
        query: req.clean.text,
        previous: unique[dupeIndex].source,
        hit: describeHit(hit)
      });
    }

    // stop iterating when requested size has been reached in unique
    if( unique.length >= req.clean.size ){ break; }
  }

  // replace the original data with only the unique hits
  res.data = unique;

  next();
}
/**
 * Decide whether a duplicate record should replace the one already kept.
 *
 * Rules are applied in order, the first one which matches wins:
 *  1. a record with a postcode (`address_parts.zip`) beats one without
 *  2. a record from a source not listed in `canonicalSources` beats a record
 *     from a source which is listed
 *  3. fixed per-source preferences (see the switch statement below)
 *
 * @param {object} existingHit - the record currently held in the unique list
 * @param {object} candidateHit - the newly found duplicate of `existingHit`
 * @returns {boolean} true if `candidateHit` is preferred over `existingHit`
 */
function isPreferred(existingHit, candidateHit) {
  const existingHasZip = _.has(existingHit, 'address_parts.zip');
  const candidateHasZip = _.has(candidateHit, 'address_parts.zip');

  // prefer a record with a postcode
  // https://github.com/pelias/api/issues/872
  if( !existingHasZip && candidateHasZip ){ return true; }

  // the same rule in the opposite direction: never let a record without a
  // postcode replace one which has a postcode, regardless of its source
  if( existingHasZip && !candidateHasZip ){ return false; }

  // prefer non-canonical sources over canonical ones
  if( !_.includes(canonicalSources, candidateHit.source) &&
       _.includes(canonicalSources, existingHit.source) ){ return true; }

  // prefer certain sources over others
  switch( existingHit.source ){
    // sources are the same
    case candidateHit.source: return false;
    // WOF has bbox and is generally preferred
    case 'geonames': return candidateHit.source === 'whosonfirst';
    // addresses are generally better in OA
    case 'openstreetmap': return candidateHit.source === 'openaddresses';
    // venues are better in OSM
    case 'whosonfirst': return candidateHit.source === 'openstreetmap';
    // no preference, keep existing hit
    default: return false;
  }
}
/**
 * Module entry point: a factory which takes no arguments and hands back the
 * `dedupeResults` middleware function.
 */
module.exports = () => dedupeResults;