diff --git a/helper/TypeMapping.js b/helper/TypeMapping.js index 97d87a26..81047fa3 100644 --- a/helper/TypeMapping.js +++ b/helper/TypeMapping.js @@ -26,6 +26,11 @@ var TypeMapping = function(){ */ this.layer_aliases = {}; + /* + * A list of the canonical sources included in the default Pelias configuration + */ + this.canonical_sources = []; + /* * An object that contains all sources or aliases. The key is the source or alias, * the value is either that source, or the canonical name for that alias if it's an alias. @@ -65,6 +70,11 @@ TypeMapping.prototype.setLayerAliases = function( aliases ){ this.layer_aliases = aliases; }; +// canonical sources setter +TypeMapping.prototype.setCanonicalSources = function( sources ){ + this.canonical_sources = sources; +}; + // generate mappings after setters have been run TypeMapping.prototype.generateMappings = function(){ this.sources = Object.keys( this.layers_by_source ); @@ -75,6 +85,17 @@ TypeMapping.prototype.generateMappings = function(){ this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); }; +// generate a list of all layers which are part of the canonical Pelias configuration +TypeMapping.prototype.getCanonicalLayers = function(){ + var canonicalLayers = []; + for( var source in this.layers_by_source ){ + if( _.includes( this.canonical_sources, source ) ){ + canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) ); + } + } + return canonicalLayers; +}; + // load values from targets block TypeMapping.prototype.loadTargets = function( targetsBlock ){ @@ -84,6 +105,7 @@ TypeMapping.prototype.loadTargets = function( targetsBlock ){ this.setSourceAliases( targetsBlock.source_aliases || {} ); this.setLayersBySource( targetsBlock.layers_by_source || {} ); this.setLayerAliases( targetsBlock.layer_aliases || {} ); + this.setCanonicalSources( targetsBlock.canonical_sources || [] ); // generate the mappings this.generateMappings(); diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 712c7959..92b9ea51 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -1,176 +1,150 @@ -var _ = require('lodash'); -var placeTypes = require('./placeTypes'); +const _ = require('lodash'); +const placeTypes = require('./placeTypes'); +const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); +const field = require('../helper/fieldValue'); /** * Compare the layer properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertLayerMatch(item1, item2) { - if (item1.layer === item2.layer) { - return false; +function isLayerDifferent(item1, item2){ + if( isPropertyDifferent(item1, item2, 'layer') ){ + // consider all custom layers to be analogous to a venue + if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) && + ( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){ + return false; + } + return true; } - - throw new Error('different'); + return false; } /** - * Compare the parent.*_id properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the parent properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertParentHierarchyMatch(item1, item2) { - // if neither object has parent, assume same - if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { - return false; - } +function isParentHierarchyDifferent(item1, item2){ + let parent1 = _.get(item1, 'parent'); + let parent2 = _.get(item2, 'parent'); - // if both have parent, do the rest of the checking - if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { - placeTypes.forEach(function (placeType) { - // don't consider its own id - if (placeType === item1.layer) { - return; - } - propMatch(item1.parent, item2.parent, placeType + '_id'); - }); - return false; - } + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(parent1); + let isPojo2 = _.isPlainObject(parent2); + + // if neither object has parent info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } + + // if only one has parent info, we consider them the same + // note: this really shouldn't happen as at least on parent should exist + if( !isPojo1 || !isPojo2 ){ return false; } + + // else both have parent info + // iterate over all the placetypes, comparing between items + return placeTypes.some( placeType => { - // if one has parent and the other doesn't consider different - throw new Error('different'); + // skip the parent field corresponding to the item placetype + if( placeType === item1.layer ){ return false; } + + // ensure the parent ids are the same for all placetypes + return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); + }); } /** - * Compare the name.* properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the name properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertNameMatch(item1, item2) { - if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { - for (var lang in item1.name) { - if(item2.name.hasOwnProperty(lang) || lang === 'default') { - // do not consider absence of an additional name as a difference - propMatch(item1.name, item2.name, lang); - } +function isNameDifferent(item1, item2){ + let names1 = _.get(item1, 'name'); + let names2 = _.get(item2, 'name'); + + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(names1); + let isPojo2 = _.isPlainObject(names2); + + // if neither object has name info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } + + // if only one has name info, we consider them the same + // note: this really shouldn't happen as name is a mandatory field + if( !isPojo1 || !isPojo2 ){ return false; } + + // else both have name info + // iterate over all the languages in item1, comparing between items + return Object.keys(names1).some( lang => { + + // do not consider absence of an additional name as a difference + // but strictly enfore that 'default' must be present and match + if( _.has(names2, lang) || lang === 'default' ){ + + // do not consider absence of an additional name as a difference + return isPropertyDifferent(names1, names2, lang); } - } - else { - propMatch(item1, item2, 'name'); - } + }); } /** * Compare the address_parts properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertAddressMatch(item1, item2) { - // if neither record has address, assume same - if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { - return false; - } +function isAddressDifferent(item1, item2){ + let address1 = _.get(item1, 'address_parts'); + let address2 = _.get(item2, 'address_parts'); - // if both have address, check parts - if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { - propMatch(item1.address_parts, item2.address_parts, 'number'); - propMatch(item1.address_parts, item2.address_parts, 'street'); + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(address1); + let isPojo2 = _.isPlainObject(address2); - // only compare zip if both records have it, otherwise just ignore and assume it's the same - // since by this time we've already compared parent hierarchies - if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { - propMatch(item1.address_parts, item2.address_parts, 'zip'); - } + // if neither object has address info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } + + // if only one has address info, we consider them the same + if( !isPojo1 || !isPojo2 ){ return false; } - return false; + // else both have address info + if( isPropertyDifferent(address1, address2, 'number') ){ return true; } + if( isPropertyDifferent(address1, address2, 'street') ){ return true; } + + // only compare zip if both records have it, otherwise just ignore and assume it's the same + // since by this time we've already compared parent hierarchies + if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ + if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } } - // one has address and the other doesn't, different! - throw new Error('different'); + return false; } /** * Compare the two records and return true if they differ and false if same. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} */ -function isDifferent(item1, item2) { - try { - assertLayerMatch(item1, item2); - assertParentHierarchyMatch(item1, item2); - assertNameMatch(item1, item2); - assertAddressMatch(item1, item2); - } - catch (err) { - if (err.message === 'different') { - return true; - } - throw err; - } - +function isDifferent(item1, item2){ + if( isLayerDifferent( item1, item2 ) ){ return true; } + if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } + if( isNameDifferent( item1, item2 ) ){ return true; } + if( isAddressDifferent( item1, item2 ) ){ return true; } return false; } /** - * Throw exception if properties are different - * - * @param {object} item1 - * @param {object} item2 - * @param {string} prop - * @throws {Error} + * return true if properties are different */ -function propMatch(item1, item2, prop) { - var prop1 = item1[prop]; - var prop2 = item2[prop]; +function isPropertyDifferent(item1, item2, prop ){ - // in the case the property is an array (currently only in parent schema) - // simply take the 1st item. this will change in the near future to support multiple hierarchies - if (_.isArray(prop1)) { prop1 = prop1[0]; } - if (_.isArray(prop2)) { prop2 = prop2[0]; } + // if neither item has prop, we consider them the same + if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; } - if (normalizeString(prop1) !== normalizeString(prop2)) { - throw new Error('different'); - } + // handle arrays and other non-string values + var prop1 = field.getStringValue( _.get( item1, prop ) ); + var prop2 = field.getStringValue( _.get( item2, prop ) ); + + // compare strings + return normalizeString(prop1) !== normalizeString(prop2); } /** - * Remove punctuation and lowercase - * - * @param {string} str - * @returns {string} + * lowercase characters and remove some punctuation */ -function normalizeString(str) { - if (!_.isString(str)) { - return str; - } - - if (_.isEmpty(str)) { - return ''; - } - +function normalizeString(str){ return str.toLowerCase().split(/[ ,-]+/).join(' '); } diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 4b637ed5..18054335 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -1,89 +1,99 @@ const logger = require('pelias-logger').get('api'); const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; +const canonical_sources = require('../helper/type_mapping').canonical_sources; const field = require('../helper/fieldValue'); -function setup() { - return dedupeResults; -} - function dedupeResults(req, res, next) { - // do nothing if no result data set - if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) { - return next(); - } - // loop through data items and only copy unique items to uniqueResults - var uniqueResults = []; + // do nothing if request data is invalid + if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); } + + // do nothing if no result data is invalid + if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } + + // loop through data items and only copy unique items to unique + // note: the first results must always be unique! + let unique = [ res.data[0] ]; + + // convenience function to search unique array for an existing element which matches a hit + let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit)); + + // iterate over res.data using an old-school for loop starting at index 1 + // we can call break at any time to end the iterator + for( let i=1; i= req.clean.size ){ break; } + } - res.data = uniqueResults; + // replace the original data with only the unique hits + res.data = unique; next(); } -function isPreferred(existing, candidateReplacement) { - // NOTE: we are assuming here that the layer for both records is the same - const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); +// return true if the second argument represents a hit which is preferred +// to the hit in the first argument +function isPreferred(existingHit, candidateHit) { + // prefer a record with a postcode // https://github.com/pelias/api/issues/872 - const candidateHasZip = hasZip(candidateReplacement); - const existingHasZip = hasZip(existing); - if (candidateHasZip !== existingHasZip) { - return candidateHasZip; - } - - //bind the trumps function to the data items to keep the rest of the function clean - var trumpsFunc = trumps.bind(null, existing, candidateReplacement); + if( !_.has(existingHit, 'address_parts.zip') && + _.has(candidateHit, 'address_parts.zip') ){ return true; } - return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred - trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA - trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time -} + // prefer non-canonical sources over canonical ones + if( !_.includes(canonical_sources, candidateHit.source) && + _.includes(canonical_sources, existingHit.source) ){ return true; } -function trumps(existing, candidateReplacement, loserSource, winnerSource) { - return existing.source === loserSource && candidateReplacement.source === winnerSource; + // prefer certain sources over others + switch( existingHit.source ){ + // sources are the same + case candidateHit.source: return false; + // WOF has bbox and is generally preferred + case 'geonames': return candidateHit.source === 'whosonfirst'; + // addresses are generally better in OA + case 'openstreetmap': return candidateHit.source === 'openaddresses'; + // venues are better in OSM + case 'whosonfirst': return candidateHit.source === 'openstreetmap'; + // no preference, keep existing hit + default: return false; + } } -module.exports = setup; +module.exports = function() { + return dedupeResults; +}; diff --git a/package.json b/package.json index 80e5f79d..25104b7a 100644 --- a/package.json +++ b/package.json @@ -52,7 +52,7 @@ "markdown": "^0.5.0", "morgan": "^1.8.2", "pelias-categories": "^1.2.0", - "pelias-config": "^3.0.2", + "pelias-config": "^3.7.0", "pelias-labels": "^1.8.0", "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", diff --git a/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js new file mode 100644 index 00000000..4c1f5f58 --- /dev/null +++ b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js @@ -0,0 +1,32 @@ +module.exports = [ + { + '_id': '101914069', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.98 + }, + { + '_id': '2456::trimet::major_employer', + 'layer': 'major_employer', + 'source': 'transit', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.50 + } +]; \ No newline at end of file diff --git a/test/unit/fixture/dedupe_only_postalcode_differs.js b/test/unit/fixture/dedupe_only_postalcode_differs.js new file mode 100644 index 00000000..2cbeb4d6 --- /dev/null +++ b/test/unit/fixture/dedupe_only_postalcode_differs.js @@ -0,0 +1,27 @@ +module.exports = [ + { + '_id': '101914069', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'A place' + }, + 'parent': { + 'country_a': ['USA'] + } + }, + { + '_id': '323', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'A place' + }, + 'address_parts': { + 'zip': '97005' + }, + 'parent': { + 'country_a': ['USA'] + } + } +]; diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 9b4d04c3..63cb3373 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -1,5 +1,7 @@ var data = require('../fixture/dedupe_elasticsearch_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); +var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results'); +var onlyPostalcodeDiffersData = require('../fixture/dedupe_only_postalcode_differs'); var dedupe = require('../../../middleware/dedupe')(); module.exports.tests = {}; @@ -56,10 +58,47 @@ module.exports.tests.dedupe = function(test, common) { t.end(); }); }); + + test('deduplicate between custom layers and venue layers', function(t) { + var req = { + clean: { + size: 20 + } + }; + var res = { + data: customLayerData + }; + var expected = customLayerData[1]; // non-canonical record + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'only one result displayed'); + t.equal(res.data[0], expected, 'non-canonical data is preferred'); + t.end(); + }); + }); + + test('test records with no address except one has postalcode', function(t) { + var req = { + clean: { + size: 20 + } + }; + var res = { + data: onlyPostalcodeDiffersData + }; + var expected = onlyPostalcodeDiffersData[1]; // record with postcode + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'only one result displayed'); + t.equal(res.data[0], expected, 'record with postalcode is preferred'); + t.end(); + }); + }); }; -module.exports.tests.trump = function(test, common) { - test('whosonfirst trumps geonames, replace', function (t) { + +module.exports.tests.priority = function(test, common) { + test('whosonfirst takes priority over geonames, replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -91,7 +130,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('whosonfirst trumps geonames, no replace', function (t) { + test('whosonfirst takes priority over geonames, no replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -123,7 +162,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openstreetmap trumps whosonfirst venues', function (t) { + test('openstreetmap takes priority over whosonfirst venues', function (t) { var req = { clean: { text: 'Lancaster Dairy Farm', @@ -155,7 +194,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses trumps openstreetmap', function (t) { + test('openaddresses takes priority over openstreetmap', function (t) { var req = { clean: { text: '100 Main St', @@ -187,7 +226,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses with zip trumps openaddresses without zip', function (t) { + test('openaddresses with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St', @@ -223,7 +262,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('osm with zip trumps openaddresses without zip', function (t) { + test('osm with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St',