mirror of https://github.com/pelias/api.git
Julian Simioni
6 years ago
committed by
GitHub
7 changed files with 307 additions and 203 deletions
@ -1,176 +1,150 @@
|
||||
var _ = require('lodash'); |
||||
var placeTypes = require('./placeTypes'); |
||||
const _ = require('lodash'); |
||||
const placeTypes = require('./placeTypes'); |
||||
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); |
||||
const field = require('../helper/fieldValue'); |
||||
|
||||
/** |
||||
* Compare the layer properties if they exist. |
||||
* Returns false if the objects are the same, and throws |
||||
* an exception with the message 'different' if not. |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @returns {boolean} |
||||
* @throws {Error} |
||||
* Returns false if the objects are the same, else true. |
||||
*/ |
||||
function assertLayerMatch(item1, item2) { |
||||
if (item1.layer === item2.layer) { |
||||
return false; |
||||
function isLayerDifferent(item1, item2){ |
||||
if( isPropertyDifferent(item1, item2, 'layer') ){ |
||||
// consider all custom layers to be analogous to a venue
|
||||
if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) && |
||||
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){ |
||||
return false; |
||||
} |
||||
return true; |
||||
} |
||||
|
||||
throw new Error('different'); |
||||
return false; |
||||
} |
||||
|
||||
/** |
||||
* Compare the parent.*_id properties if they exist. |
||||
* Returns false if the objects are the same, and throws |
||||
* an exception with the message 'different' if not. |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @returns {boolean} |
||||
* @throws {Error} |
||||
* Compare the parent properties if they exist. |
||||
* Returns false if the objects are the same, else true. |
||||
*/ |
||||
function assertParentHierarchyMatch(item1, item2) { |
||||
// if neither object has parent, assume same
|
||||
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { |
||||
return false; |
||||
} |
||||
function isParentHierarchyDifferent(item1, item2){ |
||||
let parent1 = _.get(item1, 'parent'); |
||||
let parent2 = _.get(item2, 'parent'); |
||||
|
||||
// if both have parent, do the rest of the checking
|
||||
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { |
||||
placeTypes.forEach(function (placeType) { |
||||
// don't consider its own id
|
||||
if (placeType === item1.layer) { |
||||
return; |
||||
} |
||||
propMatch(item1.parent, item2.parent, placeType + '_id'); |
||||
}); |
||||
return false; |
||||
} |
||||
// check if these are plain ol' javascript objects
|
||||
let isPojo1 = _.isPlainObject(parent1); |
||||
let isPojo2 = _.isPlainObject(parent2); |
||||
|
||||
// if neither object has parent info, we consider them the same
|
||||
if( !isPojo1 && !isPojo2 ){ return false; } |
||||
|
||||
// if only one has parent info, we consider them the same
|
||||
// note: this really shouldn't happen as at least one parent should exist
|
||||
if( !isPojo1 || !isPojo2 ){ return false; } |
||||
|
||||
// else both have parent info
|
||||
// iterate over all the placetypes, comparing between items
|
||||
return placeTypes.some( placeType => { |
||||
|
||||
// if one has parent and the other doesn't consider different
|
||||
throw new Error('different'); |
||||
// skip the parent field corresponding to the item placetype
|
||||
if( placeType === item1.layer ){ return false; } |
||||
|
||||
// ensure the parent ids are the same for all placetypes
|
||||
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Compare the name.* properties if they exist. |
||||
* Returns false if the objects are the same, and throws |
||||
* an exception with the message 'different' if not. |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @returns {boolean} |
||||
* @throws {Error} |
||||
* Compare the name properties if they exist. |
||||
* Returns false if the objects are the same, else true. |
||||
*/ |
||||
function assertNameMatch(item1, item2) { |
||||
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { |
||||
for (var lang in item1.name) { |
||||
if(item2.name.hasOwnProperty(lang) || lang === 'default') { |
||||
// do not consider absence of an additional name as a difference
|
||||
propMatch(item1.name, item2.name, lang); |
||||
} |
||||
function isNameDifferent(item1, item2){ |
||||
let names1 = _.get(item1, 'name'); |
||||
let names2 = _.get(item2, 'name'); |
||||
|
||||
// check if these are plain ol' javascript objects
|
||||
let isPojo1 = _.isPlainObject(names1); |
||||
let isPojo2 = _.isPlainObject(names2); |
||||
|
||||
// if neither object has name info, we consider them the same
|
||||
if( !isPojo1 && !isPojo2 ){ return false; } |
||||
|
||||
// if only one has name info, we consider them the same
|
||||
// note: this really shouldn't happen as name is a mandatory field
|
||||
if( !isPojo1 || !isPojo2 ){ return false; } |
||||
|
||||
// else both have name info
|
||||
// iterate over all the languages in item1, comparing between items
|
||||
return Object.keys(names1).some( lang => { |
||||
|
||||
// do not consider absence of an additional name as a difference
|
||||
// but strictly enforce that 'default' must be present and match
|
||||
if( _.has(names2, lang) || lang === 'default' ){ |
||||
|
||||
// do not consider absence of an additional name as a difference
|
||||
return isPropertyDifferent(names1, names2, lang); |
||||
} |
||||
} |
||||
else { |
||||
propMatch(item1, item2, 'name'); |
||||
} |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Compare the address_parts properties if they exist. |
||||
* Returns false if the objects are the same, and throws |
||||
* an exception with the message 'different' if not. |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @returns {boolean} |
||||
* @throws {Error} |
||||
* Returns false if the objects are the same, else true. |
||||
*/ |
||||
function assertAddressMatch(item1, item2) { |
||||
// if neither record has address, assume same
|
||||
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { |
||||
return false; |
||||
} |
||||
function isAddressDifferent(item1, item2){ |
||||
let address1 = _.get(item1, 'address_parts'); |
||||
let address2 = _.get(item2, 'address_parts'); |
||||
|
||||
// if both have address, check parts
|
||||
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { |
||||
propMatch(item1.address_parts, item2.address_parts, 'number'); |
||||
propMatch(item1.address_parts, item2.address_parts, 'street'); |
||||
// check if these are plain ol' javascript objects
|
||||
let isPojo1 = _.isPlainObject(address1); |
||||
let isPojo2 = _.isPlainObject(address2); |
||||
|
||||
// only compare zip if both records have it, otherwise just ignore and assume it's the same
|
||||
// since by this time we've already compared parent hierarchies
|
||||
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { |
||||
propMatch(item1.address_parts, item2.address_parts, 'zip'); |
||||
} |
||||
// if neither object has address info, we consider them the same
|
||||
if( !isPojo1 && !isPojo2 ){ return false; } |
||||
|
||||
// if only one has address info, we consider them the same
|
||||
if( !isPojo1 || !isPojo2 ){ return false; } |
||||
|
||||
return false; |
||||
// else both have address info
|
||||
if( isPropertyDifferent(address1, address2, 'number') ){ return true; } |
||||
if( isPropertyDifferent(address1, address2, 'street') ){ return true; } |
||||
|
||||
// only compare zip if both records have it, otherwise just ignore and assume it's the same
|
||||
// since by this time we've already compared parent hierarchies
|
||||
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ |
||||
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } |
||||
} |
||||
|
||||
// one has address and the other doesn't, different!
|
||||
throw new Error('different'); |
||||
return false; |
||||
} |
||||
|
||||
/** |
||||
* Compare the two records and return true if they differ and false if same. |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @returns {boolean} |
||||
* @throws {Error} |
||||
*/ |
||||
function isDifferent(item1, item2) { |
||||
try { |
||||
assertLayerMatch(item1, item2); |
||||
assertParentHierarchyMatch(item1, item2); |
||||
assertNameMatch(item1, item2); |
||||
assertAddressMatch(item1, item2); |
||||
} |
||||
catch (err) { |
||||
if (err.message === 'different') { |
||||
return true; |
||||
} |
||||
throw err; |
||||
} |
||||
|
||||
function isDifferent(item1, item2){ |
||||
if( isLayerDifferent( item1, item2 ) ){ return true; } |
||||
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } |
||||
if( isNameDifferent( item1, item2 ) ){ return true; } |
||||
if( isAddressDifferent( item1, item2 ) ){ return true; } |
||||
return false; |
||||
} |
||||
|
||||
/** |
||||
* Throw exception if properties are different |
||||
* |
||||
* @param {object} item1 |
||||
* @param {object} item2 |
||||
* @param {string} prop |
||||
* @throws {Error} |
||||
* return true if properties are different |
||||
*/ |
||||
function propMatch(item1, item2, prop) { |
||||
var prop1 = item1[prop]; |
||||
var prop2 = item2[prop]; |
||||
function isPropertyDifferent(item1, item2, prop ){ |
||||
|
||||
// in the case the property is an array (currently only in parent schema)
|
||||
// simply take the 1st item. this will change in the near future to support multiple hierarchies
|
||||
if (_.isArray(prop1)) { prop1 = prop1[0]; } |
||||
if (_.isArray(prop2)) { prop2 = prop2[0]; } |
||||
// if neither item has prop, we consider them the same
|
||||
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; } |
||||
|
||||
if (normalizeString(prop1) !== normalizeString(prop2)) { |
||||
throw new Error('different'); |
||||
} |
||||
// handle arrays and other non-string values
|
||||
var prop1 = field.getStringValue( _.get( item1, prop ) ); |
||||
var prop2 = field.getStringValue( _.get( item2, prop ) ); |
||||
|
||||
// compare strings
|
||||
return normalizeString(prop1) !== normalizeString(prop2); |
||||
} |
||||
|
||||
/** |
||||
* Remove punctuation and lowercase |
||||
* |
||||
* @param {string} str |
||||
* @returns {string} |
||||
* lowercase characters and remove some punctuation |
||||
*/ |
||||
function normalizeString(str) { |
||||
if (!_.isString(str)) { |
||||
return str; |
||||
} |
||||
|
||||
if (_.isEmpty(str)) { |
||||
return ''; |
||||
} |
||||
|
||||
function normalizeString(str){ |
||||
return str.toLowerCase().split(/[ ,-]+/).join(' '); |
||||
} |
||||
|
||||
|
@ -1,89 +1,99 @@
|
||||
const logger = require('pelias-logger').get('api'); |
||||
const _ = require('lodash'); |
||||
const isDifferent = require('../helper/diffPlaces').isDifferent; |
||||
const canonical_sources = require('../helper/type_mapping').canonical_sources; |
||||
const field = require('../helper/fieldValue'); |
||||
|
||||
function setup() { |
||||
return dedupeResults; |
||||
} |
||||
|
||||
function dedupeResults(req, res, next) { |
||||
// do nothing if no result data set
|
||||
if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) { |
||||
return next(); |
||||
} |
||||
|
||||
// loop through data items and only copy unique items to uniqueResults
|
||||
var uniqueResults = []; |
||||
// do nothing if request data is invalid
|
||||
if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); } |
||||
|
||||
// do nothing if result data is invalid or empty
|
||||
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } |
||||
|
||||
// loop through data items and only copy unique items to unique
|
||||
// note: the first result must always be unique!
|
||||
let unique = [ res.data[0] ]; |
||||
|
||||
// convenience function to search unique array for an existing element which matches a hit
|
||||
let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit)); |
||||
|
||||
// iterate over res.data using an old-school for loop starting at index 1
|
||||
// we can call break at any time to end the iterator
|
||||
for( let i=1; i<res.data.length; i++){ |
||||
let hit = res.data[i]; |
||||
|
||||
// if there are multiple items in results, loop through them to find a dupe
|
||||
// save off the index of the dupe if found
|
||||
let dupeIndex = findMatch(hit); |
||||
|
||||
// if a dupe is not found, just add to list of unique hits and continue
|
||||
if( dupeIndex === -1 ){ unique.push(hit); } |
||||
|
||||
_.some(res.data, function (hit) { |
||||
// if dupe was found, we need to check which of the records is preferred
|
||||
// since the order in which Elasticsearch returns identical text matches is arbitrary
|
||||
// of course, if the new one is preferred we should replace previous with new
|
||||
else if( isPreferred( unique[dupeIndex], hit ) ) { |
||||
|
||||
if (_.isEmpty(uniqueResults)) { |
||||
uniqueResults.push(hit); |
||||
// replace previous dupe item with current hit
|
||||
unique[dupeIndex] = hit; |
||||
|
||||
// logging
|
||||
logger.debug('[dupe][replacing]', { |
||||
query: req.clean.text, |
||||
previous: unique[dupeIndex].source, |
||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||
}); |
||||
} |
||||
|
||||
// if not preferred over existing, just log and continue
|
||||
else { |
||||
// if there are multiple items in results, loop through them to find a dupe
|
||||
// save off the index of the dupe if found
|
||||
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) { |
||||
return !isDifferent(elem, hit); |
||||
logger.debug('[dupe][skipping]', { |
||||
query: req.clean.text, |
||||
previous: unique[dupeIndex].source, |
||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||
}); |
||||
|
||||
// if a dupe is not found, just add to results and move on
|
||||
if (dupeIndex === -1) { |
||||
uniqueResults.push(hit); |
||||
} |
||||
// if dupe was found, we need to check which of the records is preferred
|
||||
// since the order in which Elasticsearch returns identical text matches is arbitrary
|
||||
// of course, if the new one is preferred we should replace previous with new
|
||||
else if (isPreferred(uniqueResults[dupeIndex], hit)) { |
||||
logger.debug('[dupe][replacing]', { |
||||
query: req.clean.text, |
||||
previous: uniqueResults[dupeIndex].source, |
||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||
}); |
||||
// replace previous dupe item with current hit
|
||||
uniqueResults[dupeIndex] = hit; |
||||
} |
||||
// if not preferred over existing, just log and move on
|
||||
else { |
||||
logger.debug('[dupe][skipping]', { |
||||
query: req.clean.text, |
||||
previous: uniqueResults[dupeIndex].source, |
||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||
}); |
||||
} |
||||
} |
||||
|
||||
// stop looping when requested size has been reached in uniqueResults
|
||||
return req.clean.size <= uniqueResults.length; |
||||
}); |
||||
// stop iterating when requested size has been reached in unique
|
||||
if( unique.length >= req.clean.size ){ break; } |
||||
} |
||||
|
||||
res.data = uniqueResults; |
||||
// replace the original data with only the unique hits
|
||||
res.data = unique; |
||||
|
||||
next(); |
||||
} |
||||
|
||||
function isPreferred(existing, candidateReplacement) { |
||||
// NOTE: we are assuming here that the layer for both records is the same
|
||||
const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); |
||||
// return true if the second argument represents a hit which is preferred
|
||||
// to the hit in the first argument
|
||||
function isPreferred(existingHit, candidateHit) { |
||||
|
||||
// prefer a record with a postcode
|
||||
// https://github.com/pelias/api/issues/872
|
||||
const candidateHasZip = hasZip(candidateReplacement); |
||||
const existingHasZip = hasZip(existing); |
||||
if (candidateHasZip !== existingHasZip) { |
||||
return candidateHasZip; |
||||
} |
||||
|
||||
//bind the trumps function to the data items to keep the rest of the function clean
|
||||
var trumpsFunc = trumps.bind(null, existing, candidateReplacement); |
||||
if( !_.has(existingHit, 'address_parts.zip') && |
||||
_.has(candidateHit, 'address_parts.zip') ){ return true; } |
||||
|
||||
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred
|
||||
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA
|
||||
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time
|
||||
} |
||||
// prefer non-canonical sources over canonical ones
|
||||
if( !_.includes(canonical_sources, candidateHit.source) && |
||||
_.includes(canonical_sources, existingHit.source) ){ return true; } |
||||
|
||||
function trumps(existing, candidateReplacement, loserSource, winnerSource) { |
||||
return existing.source === loserSource && candidateReplacement.source === winnerSource; |
||||
// prefer certain sources over others
|
||||
switch( existingHit.source ){ |
||||
// sources are the same
|
||||
case candidateHit.source: return false; |
||||
// WOF has bbox and is generally preferred
|
||||
case 'geonames': return candidateHit.source === 'whosonfirst'; |
||||
// addresses are generally better in OA
|
||||
case 'openstreetmap': return candidateHit.source === 'openaddresses'; |
||||
// venues are better in OSM
|
||||
case 'whosonfirst': return candidateHit.source === 'openstreetmap'; |
||||
// no preference, keep existing hit
|
||||
default: return false; |
||||
} |
||||
} |
||||
|
||||
module.exports = setup; |
||||
module.exports = function() { |
||||
return dedupeResults; |
||||
}; |
||||
|
@ -0,0 +1,32 @@
|
||||
module.exports = [ |
||||
{ |
||||
'_id': '101914069', |
||||
'layer': 'venue', |
||||
'source': 'openstreetmap', |
||||
'name': { |
||||
'default': 'Nike World Headquarters' |
||||
}, |
||||
'parent': { |
||||
'country_a': ['USA'], |
||||
'country': ['United States'], |
||||
'region': ['Oregon'], |
||||
'region_id': ['85688513'] |
||||
}, |
||||
'confidence': 0.98 |
||||
}, |
||||
{ |
||||
'_id': '2456::trimet::major_employer', |
||||
'layer': 'major_employer', |
||||
'source': 'transit', |
||||
'name': { |
||||
'default': 'Nike World Headquarters' |
||||
}, |
||||
'parent': { |
||||
'country_a': ['USA'], |
||||
'country': ['United States'], |
||||
'region': ['Oregon'], |
||||
'region_id': ['85688513'] |
||||
}, |
||||
'confidence': 0.50 |
||||
} |
||||
]; |
@ -0,0 +1,27 @@
|
||||
module.exports = [ |
||||
{ |
||||
'_id': '101914069', |
||||
'layer': 'venue', |
||||
'source': 'openstreetmap', |
||||
'name': { |
||||
'default': 'A place' |
||||
}, |
||||
'parent': { |
||||
'country_a': ['USA'] |
||||
} |
||||
}, |
||||
{ |
||||
'_id': '323', |
||||
'layer': 'venue', |
||||
'source': 'openstreetmap', |
||||
'name': { |
||||
'default': 'A place' |
||||
}, |
||||
'address_parts': { |
||||
'zip': '97005' |
||||
}, |
||||
'parent': { |
||||
'country_a': ['USA'] |
||||
} |
||||
} |
||||
]; |
Loading…
Reference in new issue