From 0e6bf8ed00abdbc82d023cfcaffc2f9d48403f7b Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 10 Mar 2016 09:07:22 +0200 Subject: [PATCH 1/7] Improve response deduping Consider locality and neighborhood, too. Do not take absence of an attribute as a difference. --- middleware/dedupe.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 9e0e5c6a..cd780bdf 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -42,13 +42,17 @@ function isDifferent(item1, item2) { if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { propMatch(item1.parent, item2.parent, 'region_a'); propMatch(item1.parent, item2.parent, 'country'); + propMatch(item1.parent, item2.parent, 'locality'); + propMatch(item1.parent, item2.parent, 'neighborhood'); } else if (item1.parent !== item2.parent) { throw new Error('different'); } if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { - propMatch(item1.name, item2.name, 'default'); + for (var lang in item1.name) { + propMatch(item1.name, item2.name, lang); + } } else { propMatch(item1, item2, 'name'); @@ -92,6 +96,10 @@ function propMatch(item1, item2, prop) { prop2= prop2[0]; } + if (!prop1 || !prop2) { + return; // absence of information does not make an item different + } + if (normalizeString(prop1) !== normalizeString(prop2)) { throw new Error('different'); } @@ -111,4 +119,4 @@ function normalizeString(str) { } -module.exports = setup; \ No newline at end of file +module.exports = setup; From 6430930cb5203155bc9e4c5a54ce0b4b103b1916 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 10 Mar 2016 09:34:41 +0200 Subject: [PATCH 2/7] Dedupe unit test data for testing missing attributes Test data has two otherwise identical entries, but the latter one lacks some attributes. It should be considered as a duplicate. --- .../fixture/dedupe_elasticsearch_results.js | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/unit/fixture/dedupe_elasticsearch_results.js b/test/unit/fixture/dedupe_elasticsearch_results.js index 84b279d3..b332a29f 100644 --- a/test/unit/fixture/dedupe_elasticsearch_results.js +++ b/test/unit/fixture/dedupe_elasticsearch_results.js @@ -291,5 +291,28 @@ module.exports = [ '_type': 'venue', '_score': 0.9724125, 'confidence': 0.649 + }, + { // same as above, but with some missing attributes. Should be considered as duplicate. + 'center_point': { + 'lon': -81.532392, + 'lat': 40.597578 + }, + 'address': {}, + 'name': { + 'default': 'Strasburg High School' + }, + 'parent': { + 'localadmin': 'Franklin', + 'region_a': 'OH', + 'region': 'Ohio' + }, + 'category': [ + 'education' + ], + '_id': '356646971', + '_type': 'venue', + '_score': 0.9724125, + 'confidence': 0.649 } + ]; From 827788e16c90f72f88ab375c020fd0b81397c3ee Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 10 Mar 2016 10:01:52 +0200 Subject: [PATCH 3/7] Revert "Dedupe unit test data for testing missing attributes" This reverts commit 6430930cb5203155bc9e4c5a54ce0b4b103b1916. --- .../fixture/dedupe_elasticsearch_results.js | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/test/unit/fixture/dedupe_elasticsearch_results.js b/test/unit/fixture/dedupe_elasticsearch_results.js index b332a29f..84b279d3 100644 --- a/test/unit/fixture/dedupe_elasticsearch_results.js +++ b/test/unit/fixture/dedupe_elasticsearch_results.js @@ -291,28 +291,5 @@ module.exports = [ '_type': 'venue', '_score': 0.9724125, 'confidence': 0.649 - }, - { // same as above, but with some missing attributes. Should be considered as duplicate. - 'center_point': { - 'lon': -81.532392, - 'lat': 40.597578 - }, - 'address': {}, - 'name': { - 'default': 'Strasburg High School' - }, - 'parent': { - 'localadmin': 'Franklin', - 'region_a': 'OH', - 'region': 'Ohio' - }, - 'category': [ - 'education' - ], - '_id': '356646971', - '_type': 'venue', - '_score': 0.9724125, - 'confidence': 0.649 } - ]; From caaee361b2b9e3523897f3754c95ccd1befdf3d4 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 10 Mar 2016 10:04:10 +0200 Subject: [PATCH 4/7] Take absence of information as a difference, after all Is '5 main street' same as 'main street'? Probably not. It may be that less detailed data is different data, not just bad data. maybe this can be changed if coordinates are considered, too. --- middleware/dedupe.js | 4 ---- 1 file changed, 4 deletions(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index cd780bdf..5f65e54b 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -96,10 +96,6 @@ function propMatch(item1, item2, prop) { prop2= prop2[0]; } - if (!prop1 || !prop2) { - return; // absence of information does not make an item different - } - if (normalizeString(prop1) !== normalizeString(prop2)) { throw new Error('different'); } From 6ab44e0aa21a544e3a091a2b42668aea1f3e3ccc Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Wed, 16 Mar 2016 10:14:20 +0200 Subject: [PATCH 5/7] neighborhood -> neighbourhood --- middleware/dedupe.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 5f65e54b..50557a49 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -43,7 +43,7 @@ function isDifferent(item1, item2) { propMatch(item1.parent, item2.parent, 'region_a'); propMatch(item1.parent, item2.parent, 'country'); propMatch(item1.parent, item2.parent, 'locality'); - propMatch(item1.parent, item2.parent, 'neighborhood'); + propMatch(item1.parent, item2.parent, 'neighbourhood'); } else if (item1.parent !== item2.parent) { throw new Error('different'); From 0064f6f2570b34ab9b56fb2043e06816ee933c5f Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Wed, 16 Mar 2016 10:15:30 +0200 Subject: [PATCH 6/7] Add two new test entries to test neighbourhood and locality in deduping --- .../fixture/dedupe_elasticsearch_results.js | 54 +++++++++++++++++++ test/unit/middleware/dedupe.js | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/test/unit/fixture/dedupe_elasticsearch_results.js b/test/unit/fixture/dedupe_elasticsearch_results.js index 84b279d3..ca623e4e 100644 --- a/test/unit/fixture/dedupe_elasticsearch_results.js +++ b/test/unit/fixture/dedupe_elasticsearch_results.js @@ -26,6 +26,60 @@ module.exports = [ '_score': 1.2367082, 'confidence': 0.879 }, + { // same as above, but change the neighbourhood + 'center_point': { + 'lon': -77.207456, + 'lat': 41.039265 + }, + 'address': {}, + 'parent': { + 'localadmin': 'East Lampeter', + 'region_a': 'PA', + 'region': 'Pennsylvania', + 'locality': 'Smoketown', + 'country_a': 'USA', + 'county': 'Lancaster County', + 'country': 'United States', + 'neighbourhood': 'Blueland' // ### + }, + 'name': { + 'default': 'East Lampeter High School' + }, + 'category': [ + 'education' + ], + '_id': '357321757', + '_type': 'venue', + '_score': 1.2367082, + 'confidence': 0.879 + }, + { // same as #1, but change the locality + 'center_point': { + 'lon': -73.207456, + 'lat': 42.039265 + }, + 'address': {}, + 'parent': { + 'localadmin': 'East Lampeter', + 'region_a': 'PA', + 'region': 'Pennsylvania', + 'locality': 'Firetown', // ### + 'country_a': 'USA', + 'county': 'Lancaster County', + 'country': 'United States', + 'neighbourhood': 'Greenland' + }, + 'name': { + 'default': 'East Lampeter High School' + }, + 'category': [ + 'education' + ], + '_id': '357321757', + '_type': 'venue', + '_score': 1.2367082, + 'confidence': 0.879 + }, { 'center_point': { 'lon': -76.207456, diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 6707d1f9..ad553f9c 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -16,7 +16,7 @@ module.exports.tests.dedupe = function(test, common) { data: data }; - var expectedCount = 7; + var expectedCount = 9; dedupe(req, res, function () { t.equal(res.data.length, expectedCount, 'results have fewer items than before'); t.end(); From 00a4bd52a362f952b3fd01bf766ed357c55b15e2 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Wed, 16 Mar 2016 11:35:50 +0200 Subject: [PATCH 7/7] Bugfix: deduping caused an error if an array property was missing Conversion from array to string should happen independently for the compared properties, not only when both are arrays. --- middleware/dedupe.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 50557a49..d130eb7b 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -91,10 +91,8 @@ function propMatch(item1, item2, prop) { // in the case the property is an array (currently only in parent schema) // simply take the 1st item. this will change in the near future to support multiple hierarchies - if (_.isArray(prop1) && _.isArray(prop2)) { - prop1 = prop1[0]; - prop2= prop2[0]; - } + if (_.isArray(prop1)) { prop1 = prop1[0]; } + if (_.isArray(prop2)) { prop2 = prop2[0]; } if (normalizeString(prop1) !== normalizeString(prop2)) { throw new Error('different');