api/middleware/dedupe.js

const logger = require('pelias-logger').get('api');
const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent;
const field = require('../helper/fieldValue');

/**
 * Factory exported by this module.
 *
 * @returns {Function} the `dedupeResults(req, res, next)` middleware
 */
function setup() {
  const middleware = dedupeResults;
  return middleware;
}

/**
 * Middleware that collapses duplicate records in `res.data`.
 *
 * Records are visited in order; a record counts as a duplicate of an already
 * kept one when `isDifferent` reports no difference between them. When a
 * duplicate is met, `isPreferred` decides whether it takes over the slot of
 * the kept record. Iteration stops as soon as `req.clean.size` records have
 * been kept, and `res.data` is replaced with the kept list.
 *
 * @param {Object} req - request; `req.clean.text` and `req.clean.size` are read
 * @param {Object} res - response; `res.data` is read and then overwritten
 * @param {Function} next - continuation, invoked exactly once
 */
function dedupeResults(req, res, next) {
  // without sanitized input or a result set there is nothing to dedupe
  const canDedupe = !_.isUndefined(req.clean) && !_.isUndefined(res) && !_.isUndefined(res.data);
  if (!canDedupe) {
    return next();
  }

  const kept = [];

  // both dupe outcomes log the same payload, only the tag differs
  const logDupe = (tag, incumbent, hit) => {
    logger.info(tag, {
      query: req.clean.text,
      previous: incumbent.source,
      hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
    });
  };

  _.some(res.data, (hit) => {
    // position of an already kept record equivalent to this hit;
    // an empty list yields -1 as well, so the very first hit is always kept
    const matchAt = kept.findIndex((candidate) => !isDifferent(candidate, hit));

    if (matchAt < 0) {
      // first time we see this place
      kept.push(hit);
    } else if (isPreferred(kept[matchAt], hit)) {
      // Elasticsearch returns identical text matches in arbitrary order, so the
      // better record may arrive second: log first, then let it take the slot
      logDupe('[dupe][replacing]', kept[matchAt], hit);
      kept[matchAt] = hit;
    } else {
      // the record we already hold wins, the hit is dropped
      logDupe('[dupe][skipping]', kept[matchAt], hit);
    }

    // returning true ends the iteration once the requested size is reached
    return req.clean.size <= kept.length;
  });

  res.data = kept;

  next();
}

/**
 * Decides whether a duplicate record should take the place of the one already kept.
 *
 * NOTE: both records are assumed to belong to the same layer.
 *
 * @param {Object} existing - record currently held in the result list
 * @param {Object} candidateReplacement - duplicate that might replace it
 * @returns {boolean} true when the candidate should replace the existing record
 */
function isPreferred(existing, candidateReplacement) {
  const hasZip = (record) => _.has(record, 'address_parts.zip');

  // https://github.com/pelias/api/issues/872
  // when only one of the two records carries a zip, that record wins outright
  const candidateHasZip = hasZip(candidateReplacement);
  const existingHasZip = hasZip(existing);
  if (candidateHasZip !== existingHasZip) {
    return candidateHasZip;
  }

  // otherwise fall back to source precedence, listed as [loser, winner]
  const sourcePrecedence = [
    ['geonames', 'whosonfirst'], // WOF has bbox and is generally preferred
    ['openstreetmap', 'openaddresses'], // addresses are better in OA
    ['whosonfirst', 'openstreetmap'] // venues are better in OSM, at this time
  ];

  return sourcePrecedence.some(function (pair) {
    return trumps(existing, candidateReplacement, pair[0], pair[1]);
  });
}

/**
 * Tests one source-precedence rule against a pair of records.
 *
 * @param {Object} existing - record currently kept
 * @param {Object} candidateReplacement - duplicate being considered
 * @param {string} loserSource - source the existing record must have
 * @param {string} winnerSource - source the candidate must have
 * @returns {boolean} true only when both sources match the rule
 */
function trumps(existing, candidateReplacement, loserSource, winnerSource) {
  // the candidate is only inspected once the existing record matches the losing source
  if (existing.source !== loserSource) {
    return false;
  }
  return candidateReplacement.source === winnerSource;
}

module.exports = setup;
support aliases for name fields 7 years ago			`const logger = require('pelias-logger').get('api');`
			`const _ = require('lodash');`
			`const isDifferent = require('../helper/diffPlaces').isDifferent;`
			`const field = require('../helper/fieldValue');`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago
			`function setup() {`
			`return dedupeResults;`
			`}`

			`function dedupeResults(req, res, next) {`
			`// do nothing if no result data set`
			`if (_.isUndefined(req.clean) \|\| _.isUndefined(res) \|\| _.isUndefined(res.data)) {`
			`return next();`
			`}`

			`// loop through data items and only copy unique items to uniqueResults`
			`var uniqueResults = [];`

			`_.some(res.data, function (hit) {`
feat: check for preferred record when dupe found 8 years ago
			`if (_.isEmpty(uniqueResults)) {`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago			`uniqueResults.push(hit);`
			`}`
			`else {`
feat: check for preferred record when dupe found 8 years ago			`// if there are multiple items in results, loop through them to find a dupe`
			`// save off the index of the dupe if found`
			`var dupeIndex = uniqueResults.findIndex(function (elem, index, array) {`
			`return !isDifferent(elem, hit);`
			`});`

			`// if a dupe is not found, just add to results and move on`
			`if (dupeIndex === -1) {`
			`uniqueResults.push(hit);`
			`}`
			`// if dupe was found, we need to check which of the records is preferred`
			`// since the order in which Elasticsearch returns identical text matches is arbitrary`
			`// of course, if the new one is preferred we should replace previous with new`
			`else if (isPreferred(uniqueResults[dupeIndex], hit)) {`
			`logger.info('[dupe][replacing]', {`
			`query: req.clean.text,`
			`previous: uniqueResults[dupeIndex].source,`
support aliases for name fields 7 years ago			`hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id`
feat: check for preferred record when dupe found 8 years ago			`});`
			`// replace previous dupe item with current hit`
remove splice and replace with direct assignment 8 years ago			`uniqueResults[dupeIndex] = hit;`
feat: check for preferred record when dupe found 8 years ago			`}`
			`// if not preferred over existing, just log and move on`
			`else {`
			`logger.info('[dupe][skipping]', {`
			`query: req.clean.text,`
			`previous: uniqueResults[dupeIndex].source,`
support aliases for name fields 7 years ago			`hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id`
feat: check for preferred record when dupe found 8 years ago			`});`
			`}`
Add dedupe middleware Dedupe middleware removes __exact__ dupes and truncates the results to the specified size. 9 years ago			`}`

			`// stop looping when requested size has been reached in uniqueResults`
			`return req.clean.size <= uniqueResults.length;`
			`});`

			`res.data = uniqueResults;`

			`next();`
			`}`

feat: check for preferred record when dupe found 8 years ago			`function isPreferred(existing, candidateReplacement) {`
			`// NOTE: we are assuming here that the layer for both records is the same`
dedup osm/OA address linked to https://github.com/pelias/pelias/issues/541 and maybe to https://github.com/pelias/pelias/issues/541 but I cannot reproduce it. and to the fact that there are lots of duplicates in france like: [20 rue hector malot paris](https://mapzen.com/search/explorer/?query=search&text=20%20rue%20hector%20malot%20paris) that returns 1 result from open address and 1 result from OSM now we first check which source has a zipcode and if both have, we prefer OA over OSM 7 years ago			`const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip');`
Prefer openaddresses results with zip code 8 years ago
			`// https://github.com/pelias/api/issues/872`
dedup osm/OA address linked to https://github.com/pelias/pelias/issues/541 and maybe to https://github.com/pelias/pelias/issues/541 but I cannot reproduce it. and to the fact that there are lots of duplicates in france like: [20 rue hector malot paris](https://mapzen.com/search/explorer/?query=search&text=20%20rue%20hector%20malot%20paris) that returns 1 result from open address and 1 result from OSM now we first check which source has a zipcode and if both have, we prefer OA over OSM 7 years ago			`const candidateHasZip = hasZip(candidateReplacement);`
			`const existingHasZip = hasZip(existing);`
			`if (candidateHasZip !== existingHasZip) {`
			`return candidateHasZip;`
Prefer openaddresses results with zip code 8 years ago			`}`

feat: check for preferred record when dupe found 8 years ago			`//bind the trumps function to the data items to keep the rest of the function clean`
			`var trumpsFunc = trumps.bind(null, existing, candidateReplacement);`

			`return trumpsFunc('geonames', 'whosonfirst') \|\| // WOF has bbox and is generally preferred`
			`trumpsFunc('openstreetmap', 'openaddresses') \|\| // addresses are better in OA`
			`trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time`
			`}`

			`function trumps(existing, candidateReplacement, loserSource, winnerSource) {`
			`return existing.source === loserSource && candidateReplacement.source === winnerSource;`
			`}`

Improve response deduping Consider locality and neighborhood, too. Do not take absence of an attribute as a difference. 9 years ago			`module.exports = setup;`