Browse Source

feat(dedupe): improved matching across languages

dedupe
Peter Johnson 6 years ago
parent
commit
6b28707663
  1. 42
      helper/diffPlaces.js
  2. 52
      test/unit/helper/diffPlaces.js
  3. 18
      test/unit/middleware/dedupe.js

42
helper/diffPlaces.js

@ -70,17 +70,20 @@ function isNameDifferent(item1, item2){
if( !isPojo1 || !isPojo2 ){ return false; }
// else both have name info
// iterate over all the languages in item1, comparing between items
return Object.keys(names1).some( lang => {
// do not consider absence of an additional name as a difference
// but strictly enforce that 'default' must be present and match
if( _.has(names2, lang) || lang === 'default' ){
// iterate over all the languages in item2, comparing them to the
// 'default' name of item1
for( let lang in names2 ){
if( !isPropertyDifferent({[lang]: names1.default}, names2, lang) ){ return false; }
}
// do not consider absence of an additional name as a difference
return isPropertyDifferent(names1, names2, lang);
// iterate over all the languages in item1, comparing them to the
// 'default' name of item2
for( let lang in names1 ){
if( !isPropertyDifferent({[lang]: names2.default}, names1, lang) ){ return false; }
}
});
return true;
}
/**
@ -133,12 +136,25 @@ function isPropertyDifferent(item1, item2, prop ){
// if neither item has prop, we consider them the same
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
// handle arrays and other non-string values
var prop1 = field.getStringValue( _.get( item1, prop ) );
var prop2 = field.getStringValue( _.get( item2, prop ) );
// read property values, casting scalar values to arrays
var prop1 = field.getArrayValue( _.get( item1, prop ) );
var prop2 = field.getArrayValue( _.get( item2, prop ) );
// iterate over all properties in both sets, comparing each
// item in turn, return false on first match.
// handles non-string values
for( let i=0; i<prop1.length; i++ ){
let prop1StringValue = field.getStringValue( prop1[i] );
for( let j=0; j<prop2.length; j++ ){
let prop2StringValue = field.getStringValue( prop2[j] );
if( normalizeString( prop1StringValue ) === normalizeString( prop2StringValue ) ){
return false;
}
}
}
// compare strings
return normalizeString(prop1) !== normalizeString(prop2);
// we did not find any matching values, consider them different
return true;
}
/**

52
test/unit/helper/diffPlaces.js

@ -128,6 +128,50 @@ module.exports.tests.dedupe = function(test, common) {
t.end();
});
// Both records carry the same 'default' name ('Bern') while their 'eng' and
// 'deu' names disagree; the test expects isDifferent() to report no difference.
test('improved matching across languages - if default name is the same, consider this a match', function(t) {
var item1 = {
'name': {
'default': 'Bern',
'eng': 'Bern',
'deu': 'Kanton Bern',
'fra': 'Berne'
}
};
var item2 = {
'name': {
'default': 'Bern',
'eng': 'Berne',
'deu': 'Bundesstadt', // note: this is wrong, see: https://github.com/whosonfirst-data/whosonfirst-data/issues/1363
'fra': 'Berne'
}
};
// identical 'default' names should be enough for the records to match
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
// The 'default' names differ ('Berne' vs 'Bern'), but each default also
// appears under another language key of the opposite record (item1's 'Berne'
// is item2's 'eng'/'fra'; item2's 'Bern' is item1's 'eng'); the test expects
// isDifferent() to report no difference.
test('improved matching across languages - default names differ but match another language', function(t) {
var item1 = {
'name': {
'default': 'Berne',
'eng': 'Bern',
'deu': 'Kanton Bern',
'fra': 'Berne'
}
};
var item2 = {
'name': {
'default': 'Bern',
'eng': 'Berne',
'deu': 'Bundesstadt',
'fra': 'Berne'
}
};
// a default name matching any language of the other record counts as a match
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
test('catch diff address', function(t) {
var item1 = {
'address_parts': {
@ -167,6 +211,14 @@ module.exports.tests.dedupe = function(test, common) {
t.end();
});
// Edge case: neither record has any properties at all; the test expects
// isDifferent() to report no difference.
test('completely empty objects', function(t) {
var item1 = {};
var item2 = {};
t.false(isDifferent(item1, item2), 'should be the same');
t.end();
});
test('works with name aliases', function(t) {
var item1 = {
'name': {

18
test/unit/middleware/dedupe.js

@ -94,6 +94,24 @@ module.exports.tests.dedupe = function(test, common) {
t.end();
});
});
// Runs the dedupe middleware over the onlyPostalcodeDiffersData fixture
// (defined outside this excerpt) and checks which record survives.
test('test records with no address except one has postalcode', function(t) {
var req = {
clean: {
size: 20
}
};
var res = {
data: onlyPostalcodeDiffersData
};
var expected = onlyPostalcodeDiffersData[1]; // record with postcode
dedupe(req, res, function () {
// by the time the callback runs, res.data is expected to hold a single
// entry: the same object as the fixture's second element
t.equal(res.data.length, 1, 'only one result displayed');
t.equal(res.data[0], expected, 'record with postalcode is preferred');
t.end();
});
});
};

Loading…
Cancel
Save