Browse Source

Refactor deduper and write additional tests

pull/666/head
Diana Shkolnikov 8 years ago committed by Stephen Hess
parent
commit
a8e82b018d
  1. 172
      helper/diffPlaces.js
  2. 88
      middleware/dedupe.js
  3. 1261
      test/unit/fixture/dedupe_elasticsearch_results.js
  4. 180
      test/unit/helper/diffPlaces.js
  5. 2
      test/unit/middleware/dedupe.js
  6. 1
      test/unit/run.js

172
helper/diffPlaces.js

@ -0,0 +1,172 @@
var _ = require('lodash');
var placeTypes = require('../helper/placeTypes');
/**
* Compare the layer properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDiffLayer(item1, item2) {
if (item1.layer === item2.layer) {
return false;
}
throw new Error('different');
}
/**
* Compare the parent.* properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDiffParentHierarchy(item1, item2) {
// if neither object has parent, assume same
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) {
return false;
}
// if both have parent, do the rest of the checking
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) {
placeTypes.forEach(function (placeType) {
// don't consider its own id
if (placeType === item1.layer) {
return;
}
propMatch(item1.parent, item2.parent, placeType + '_id');
});
return false;
}
// if one has parent and the other doesn't consider different
throw new Error('different');
}
/**
* Compare the name.* properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDiffName(item1, item2) {
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) {
for (var lang in item1.name) {
if(item2.name[lang] || lang === 'default') {
// do not consider absence of an additional name as a difference
propMatch(item1.name, item2.name, lang);
}
}
}
else {
propMatch(item1, item2, 'name');
}
}
/**
* Compare the address_parts properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDiffAddress(item1, item2) {
// if neither record has address, assume same
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) {
return false;
}
// if both have address, check parts
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) {
propMatch(item1.address_parts, item2.address_parts, 'number');
propMatch(item1.address_parts, item2.address_parts, 'street');
// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) {
propMatch(item1.address_parts, item2.address_parts, 'zip');
}
return false;
}
// one has address and the other doesn't, different!
throw new Error('different');
}
/**
* Compare the two records and return true if they differ and false if same.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDifferent(item1, item2) {
try {
isDiffLayer(item1, item2);
isDiffParentHierarchy(item1, item2);
isDiffName(item1, item2);
isDiffAddress(item1, item2);
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}
return false;
}
/**
* Throw exception if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
*/
function propMatch(item1, item2, prop) {
var prop1 = item1[prop];
var prop2 = item2[prop];
// in the case the property is an array (currently only in parent schema)
// simply take the 1st item. this will change in the near future to support multiple hierarchies
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
if (normalizeString(prop1) !== normalizeString(prop2)) {
throw new Error('different');
}
}
/**
* Remove punctuation and lowercase
*
* @param {string} str
* @returns {string}
*/
function normalizeString(str) {
if (!str) {
return '';
}
return str.toLowerCase().split(/[ ,-]+/).join(' ');
}
module.exports.isDifferent = isDifferent;

88
middleware/dedupe.js

@ -1,5 +1,6 @@
var logger = require('pelias-logger').get('api'); var logger = require('pelias-logger').get('api');
var _ = require('lodash'); var _ = require('lodash');
var isDifferent = require('../helper/diffPlaces').isDifferent;
function setup() { function setup() {
return dedupeResults; return dedupeResults;
@ -19,7 +20,7 @@ function dedupeResults(req, res, next) {
uniqueResults.push(hit); uniqueResults.push(hit);
} }
else { else {
logger.info('[dupe]', { query: req.clean.text, hit: hit.name.default }); logger.info('[dupe]', { query: req.clean.text, hit: hit.name.default + ' ' + hit.source + ':' + hit._id });
} }
// stop looping when requested size has been reached in uniqueResults // stop looping when requested size has been reached in uniqueResults
@ -31,89 +32,4 @@ function dedupeResults(req, res, next) {
next(); next();
} }
/**
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDifferent(item1, item2) {
try {
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) {
propMatch(item1.parent, item2.parent, 'region_a');
propMatch(item1.parent, item2.parent, 'country');
propMatch(item1.parent, item2.parent, 'locality');
propMatch(item1.parent, item2.parent, 'neighbourhood');
}
else if (item1.parent !== item2.parent) {
throw new Error('different');
}
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) {
for (var lang in item1.name) {
if(item2.name[lang] || lang === 'default') {
// do not consider absence of an additional name as a difference
propMatch(item1.name, item2.name, lang);
}
}
}
else {
propMatch(item1, item2, 'name');
}
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) {
propMatch(item1.address_parts, item2.address_parts, 'number');
propMatch(item1.address_parts, item2.address_parts, 'street');
propMatch(item1.address_parts, item2.address_parts, 'zip');
}
else if (item1.address_parts !== item2.address_parts) {
throw new Error('different');
}
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}
return false;
}
/**
* Throw exception if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
*/
function propMatch(item1, item2, prop) {
var prop1 = item1[prop];
var prop2 = item2[prop];
// in the case the property is an array (currently only in parent schema)
// simply take the 1st item. this will change in the near future to support multiple hierarchies
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
if (normalizeString(prop1) !== normalizeString(prop2)) {
throw new Error('different');
}
}
/**
* Remove punctuation and lowercase
*
* @param {string} str
* @returns {string}
*/
function normalizeString(str) {
if (!str) {
return '';
}
return str.toLowerCase().split(/[ ,-]+/).join(' ');
}
module.exports = setup; module.exports = setup;

1261
test/unit/fixture/dedupe_elasticsearch_results.js

File diff suppressed because it is too large Load Diff

180
test/unit/helper/diffPlaces.js

@ -0,0 +1,180 @@
var isDifferent= require('../../../helper/diffPlaces').isDifferent;
module.exports.tests = {};
module.exports.tests.dedupe = function(test, common) {
test('match same object', function(t) {
var item1 = {
'parent': {
'country': [ 'United States' ],
'county': [ 'Otsego County' ],
'region_a': [ 'NY' ],
'localadmin': [ 'Cherry Valley' ],
'county_id': [ '102082399' ],
'localadmin_id': [ '404522887' ],
'country_a': [ 'USA' ],
'region_id': [ '85688543' ],
'locality': [ 'Cherry Valley' ],
'locality_id': [ '85978799' ],
'region': [ 'New York' ],
'country_id': [ '85633793' ]
},
'name': {
'default': '1 Main Street'
},
'address_parts': {
'number': '1',
'street': 'Main Street'
},
'layer': 'address'
};
t.false(isDifferent(item1, item1), 'should be the same');
t.end();
});
test('catch diff layers', function(t) {
var item1 = { 'layer': 'address' };
var item2 = { 'layer': 'venue' };
t.true(isDifferent(item1, item2), 'should be different');
t.end();
});
test('catch diff parent', function(t) {
var item1 = {
'layer': 'same',
'parent': {
'country_id': '12345'
}
};
var item2 = {
'layer': 'same',
'parent': {
'country_id': '54321'
}
};
t.true(isDifferent(item1, item2), 'should be different');
t.end();
});
test('catch diff name', function(t) {
var item1 = {
'name': {
'default': '1 Main St'
}
};
var item2 = {
'name': {
'default': '1 Broad St'
}
};
t.true(isDifferent(item1, item2), 'should be different');
t.end();
});
test('match diff capitalization in name', function(t) {
var item1 = {
'name': {
'default': '1 MAIN ST'
}
};
var item2 = {
'name': {
'default': '1 Main St'
}
};
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
test('do not handle expansions', function(t) {
// we currently don't handle expansions and abbreviations and
// this is a test waiting to be updated as soon as we fix it
var item1 = {
'name': {
'default': '1 Main Street'
}
};
var item2 = {
'name': {
'default': '1 Main St'
}
};
t.true(isDifferent(item1, item2), 'should be different');
t.end();
});
test('missing names in other langs should not be a diff', function(t) {
var item1 = {
'name': {
'default': 'Moscow',
'rus': 'Москва'
}
};
var item2 = {
'name': {
'default': 'Moscow'
}
};
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
test('catch diff address', function(t) {
var item1 = {
'address_parts': {
'number': '1',
'street': 'Main Street',
'zip': '90210'
}
};
var item2 = {
'address_parts': {
'number': '2',
'street': 'Main Street',
'zip': '90210'
}
};
t.true(isDifferent(item1, item2), 'should be different');
t.end();
});
test('catch diff address', function(t) {
var item1 = {
'address_parts': {
'number': '1',
'street': 'Main Street',
'zip': '90210'
}
};
var item2 = {
'address_parts': {
'number': '1',
'street': 'Main Street'
}
};
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
};
module.exports.all = function (tape, common) {
function test(name, testFunction) {
return tape('[helper] diffPlaces: ' + name, testFunction);
}
for( var testCase in module.exports.tests ){
module.exports.tests[testCase](test, common);
}
};

2
test/unit/middleware/dedupe.js

@ -16,7 +16,7 @@ module.exports.tests.dedupe = function(test, common) {
data: data data: data
}; };
var expectedCount = 9; var expectedCount = 8;
dedupe(req, res, function () { dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before'); t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.end(); t.end();

1
test/unit/run.js

@ -12,6 +12,7 @@ var tests = [
require('./controller/index'), require('./controller/index'),
require('./controller/place'), require('./controller/place'),
require('./controller/search'), require('./controller/search'),
require('./helper/diffPlaces'),
require('./helper/geojsonify'), require('./helper/geojsonify'),
require('./helper/labelGenerator_examples'), require('./helper/labelGenerator_examples'),
require('./helper/labelGenerator_default'), require('./helper/labelGenerator_default'),

Loading…
Cancel
Save