// api/middleware/dedupe.js

const logger = require('pelias-logger').get('api');
const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent;
const field = require('../helper/fieldValue');

/**
 * Middleware which removes duplicate records from `res.data`.
 *
 * Records are compared with `isDifferent()`; when a later record duplicates
 * one already kept, `isPreferred()` decides whether it replaces the earlier
 * one. Scanning stops once `req.clean.size` unique records have been collected
 * and the result never holds more than that many records.
 *
 * @param {object} req - request; `req.clean.size` and `req.clean.text` are read
 * @param {object} res - response; `res.data` is replaced with the unique records
 * @param {function} next - callback invoked once processing has finished
 */
function dedupeResults(req, res, next) {

  // do nothing if request data is invalid
  if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); }

  // do nothing if result data is missing or empty
  if( !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }

  const maxResults = req.clean.size;

  // collect unique items here
  // note: the first result must always be unique!
  const unique = [ res.data[0] ];

  // convenience function to search unique array for an existing element which matches a hit
  const findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));

  // human readable summary of a hit, used only for debug logging
  // `_.get` is used so that a hit without a `name` cannot make a log statement throw
  // NOTE(review): assumes field.getStringValue() tolerates undefined - confirm
  const describe = (hit) =>
    field.getStringValue(_.get(hit, 'name.default')) + ' ' + hit.source + ':' + hit._id;

  // iterate over res.data using an old-school for loop starting at index 1
  // we can call break at any time to end the iterator
  for( let i=1; i<res.data.length; i++ ){
    const hit = res.data[i];

    // index of the already-kept record which this hit duplicates (-1 when none)
    const dupeIndex = findMatch(hit);

    // if a dupe is not found, just add to list of unique hits and continue
    if( dupeIndex === -1 ){ unique.push(hit); }

    // if dupe was found, we need to check which of the records is preferred
    // since the order in which Elasticsearch returns identical text matches is arbitrary
    else {
      // keep a reference to the record being compared against *before* it is
      // possibly overwritten, so the log reports the record that was replaced
      const previous = unique[dupeIndex];
      const replace = isPreferred( previous, hit );

      // replace previous dupe item with current hit when the new one is preferred
      if( replace ){ unique[dupeIndex] = hit; }

      // logging
      logger.debug(replace ? '[dupe][replacing]' : '[dupe][skipping]', {
        query: req.clean.text,
        previous: previous.source,
        hit: describe(hit)
      });
    }

    // stop iterating when requested size has been reached in unique
    if( unique.length >= maxResults ){ break; }
  }

  // replace the original data with only the unique hits
  // the size check above only runs after a hit has been processed, so a size
  // of 1 could otherwise let a second record through; truncate to be safe
  res.data = ( _.isFinite(maxResults) && maxResults >= 1 ) ?
    unique.slice(0, maxResults) :
    unique;

  next();
}

/**
 * Decide whether a duplicate record should be kept in favour of an earlier one.
 *
 * @param {object} existingHit - the record seen first
 * @param {object} candidateHit - the later record which duplicates it
 * @returns {boolean} true when candidateHit is preferred over existingHit
 */
function isPreferred(existingHit, candidateHit) {
  const zipPath = 'address_parts.zip';

  // a record carrying a postcode beats one without
  // https://github.com/pelias/api/issues/872
  if( !_.has(existingHit, zipPath) && _.has(candidateHit, zipPath) ){
    return true;
  }

  const current = existingHit.source;
  const incoming = candidateHit.source;

  // identical sources: nothing to gain, keep the existing hit
  if( current === incoming ){ return false; }

  // source of the existing hit -> the only source allowed to replace it
  const upgrades = new Map([
    // WOF has bbox and is generally preferred
    ['geonames', 'whosonfirst'],
    // addresses are generally better in OA
    ['openstreetmap', 'openaddresses'],
    // venues are better in OSM
    ['whosonfirst', 'openstreetmap']
  ]);

  // any source without an entry has no preference, so the existing hit stays
  return upgrades.has(current) && upgrades.get(current) === incoming;
}

/**
 * Factory for the dedupe middleware.
 *
 * @returns {function} the `dedupeResults(req, res, next)` middleware
 */
module.exports = () => dedupeResults;
support aliases for name fields 7 years ago			`const logger = require('pelias-logger').get('api');`
			`const _ = require('lodash');`
			`const isDifferent = require('../helper/diffPlaces').isDifferent;`
			`const field = require('../helper/fieldValue');`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago
			`function dedupeResults(req, res, next) {`

refactor middleware dedupe for readability 6 years ago			`// do nothing if request data is invalid`
			`if( _.isUndefined(res) \|\| !_.isPlainObject(req.clean) ){ return next(); }`

			`// do nothing if no result data is invalid`
			`if( _.isUndefined(res) \|\| !_.isArray(res.data) \|\| _.isEmpty(res.data) ){ return next(); }`

			`// loop through data items and only copy unique items to unique`
			`// note: the first reqults must always be unique!`
			`let unique = [ res.data[0] ];`

			`// convenience function to search unique array for an existing element which matches a hit`
			`let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));`

			`// iterate over res.data using an old-school for loop starting at index 1`
			`// we can call break at any time to end the iterator`
			`for( let i=1; i<res.data.length; i++){`
			`let hit = res.data[i];`

			`// if there are multiple items in results, loop through them to find a dupe`
			`// save off the index of the dupe if found`
			`let dupeIndex = findMatch(hit);`

			`// if a dupe is not found, just add to list of unique hits and continue`
			`if( dupeIndex === -1 ){ unique.push(hit); }`

			`// if dupe was found, we need to check which of the records is preferred`
			`// since the order in which Elasticsearch returns identical text matches is arbitrary`
			`// of course, if the new one is preferred we should replace previous with new`
			`else if( isPreferred( unique[dupeIndex], hit ) ) {`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago
refactor middleware dedupe for readability 6 years ago			`// replace previous dupe item with current hit`
			`unique[dupeIndex] = hit;`
feat: check for preferred record when dupe found 8 years ago
refactor middleware dedupe for readability 6 years ago			`// logging`
			`logger.debug('[dupe][replacing]', {`
			`query: req.clean.text,`
			`previous: unique[dupeIndex].source,`
			`hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id`
			`});`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago			`}`
refactor middleware dedupe for readability 6 years ago
			`// if not preferred over existing, just log and continue`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago			`else {`
refactor middleware dedupe for readability 6 years ago			`logger.debug('[dupe][skipping]', {`
			`query: req.clean.text,`
			`previous: unique[dupeIndex].source,`
			`hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id`
feat: check for preferred record when dupe found 8 years ago			`});`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago			`}`

refactor middleware dedupe for readability 6 years ago			`// stop iterating when requested size has been reached in unique`
			`if( unique.length >= req.clean.size ){ break; }`
			`}`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago
refactor middleware dedupe for readability 6 years ago			`// replace the original data with only the unique hits`
			`res.data = unique;`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago
			`next();`
			`}`

refactor middleware dedupe for readability 6 years ago			`// return true if the second argument represents a hit which is preferred`
			`// to the hit in the first argument`
			`function isPreferred(existingHit, candidateHit) {`
Prefer openaddresses results with zip code 8 years ago
refactor middleware dedupe for readability 6 years ago			`// prefer a record with a postcode`
Prefer openaddresses results with zip code 8 years ago			`// https://github.com/pelias/api/issues/872`
refactor middleware dedupe for readability 6 years ago			`if( !_.has(existingHit, 'address_parts.zip') &&`
			`_.has(candidateHit, 'address_parts.zip') ){ return true; }`
Prefer openaddresses results with zip code 8 years ago
refactor middleware dedupe for readability 6 years ago			`// prefer certain sources over others`
			`switch( existingHit.source ){`
			`// sources are the same`
			`case candidateHit.source: return false;`
			`// WOF has bbox and is generally preferred`
			`case 'geonames': return candidateHit.source === 'whosonfirst';`
			`// addresses are generally better in OA`
			`case 'openstreetmap': return candidateHit.source === 'openaddresses';`
			`// venues are better in OSM`
			`case 'whosonfirst': return candidateHit.source === 'openstreetmap';`
			`// no preference, keep existing hit`
			`default: return false;`
			`}`
feat: check for preferred record when dupe found 8 years ago			`}`

refactor middleware dedupe for readability 6 years ago			`module.exports = function() {`
			`return dedupeResults;`
			`};`