Browse Source

Add dedupe middleware

Dedupe middleware removes __exact__ dupes and truncates the results
to the specified size.
pull/376/head
Diana Shkolnikov 9 years ago
parent
commit
54187dde67
  1. 4
      controller/search.js
  2. 21
      helper/sizeCalculator.js
  3. 82
      middleware/dedupe.js
  4. 5
      query/autocomplete.js
  5. 5
      query/reverse.js
  6. 8
      query/search.js
  7. 4
      routes/v1.js
  8. 2
      test/unit/fixture/autocomplete_linguistic_focus.js
  9. 2
      test/unit/fixture/autocomplete_linguistic_focus_null_island.js
  10. 2
      test/unit/fixture/autocomplete_linguistic_only.js
  11. 248
      test/unit/fixture/dedupe_elasticsearch_results.js
  12. 2
      test/unit/fixture/search_boundary_country.js
  13. 2
      test/unit/fixture/search_full_address.js
  14. 2
      test/unit/fixture/search_linguistic_bbox.js
  15. 2
      test/unit/fixture/search_linguistic_focus.js
  16. 2
      test/unit/fixture/search_linguistic_focus_bbox.js
  17. 2
      test/unit/fixture/search_linguistic_focus_null_island.js
  18. 2
      test/unit/fixture/search_linguistic_only.js
  19. 2
      test/unit/fixture/search_linguistic_viewport.js
  20. 2
      test/unit/fixture/search_linguistic_viewport_min_diagonal.js
  21. 2
      test/unit/fixture/search_partial_address.js
  22. 2
      test/unit/fixture/search_regions_address.js
  23. 39
      test/unit/helper/sizeCalculator.js
  24. 52
      test/unit/middleware/dedupe.js
  25. 5
      test/unit/query/reverse.js
  26. 2
      test/unit/run.js

4
controller/search.js

@ -1,4 +1,5 @@
var service = { search: require('../service/search') }; var service = { search: require('../service/search') };
var logger = require('pelias-logger').get('api:controller:search');
function setup( backend, query ){ function setup( backend, query ){
@ -14,6 +15,9 @@ function setup( backend, query ){
return next(); return next();
} }
// log clean parameters for stats
logger.info(req.clean);
// backend command // backend command
var cmd = { var cmd = {
index: 'pelias', index: 'pelias',

21
helper/sizeCalculator.js

@ -0,0 +1,21 @@
/**
* Utility for calculating query result size
* incorporating padding for dedupe process
*/
var SIZE_PADDING = 2;
/**
* Add padding or set to 1
*
* @param {number} cleanSize
* @returns {number}
*/
module.exports = function calculateSize(cleanSize) {
switch (cleanSize || 1) {
case 1:
return 1;
default:
return cleanSize * SIZE_PADDING;
}
};

82
middleware/dedupe.js

@ -0,0 +1,82 @@
var util = require('util');
var logger = require('pelias-logger').get('api:middle:dedupe');
var _ = require('lodash');
/**
 * Middleware factory.
 *
 * @returns {function} the dedupeResults middleware
 */
function setup() {
  return dedupeResults;
}

/**
 * Replace res.data with a list that contains no duplicate hits, keeping the
 * first occurrence of each and preserving the original order. Collection
 * stops as soon as req.clean.size unique hits have been gathered, so the
 * result is also truncated to the requested size.
 *
 * @param {object} req request; req.clean.text and req.clean.size are read
 * @param {object} res response; res.data is read and then overwritten
 * @param {function} next callback that hands control to the next middleware
 */
function dedupeResults(req, res, next) {
  // do nothing if no result data set
  if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) {
    return next();
  }

  // loop through data items and only copy unique items to uniqueResults
  var uniqueResults = [];

  _.some(res.data, function (hit) {
    // _.every() is true for an empty list, so the first hit is always kept
    var isUnique = _.every(uniqueResults, isDifferent.bind(null, hit));

    if (isUnique) {
      uniqueResults.push(hit);
    }
    else {
      // two hits that both lack a `name` compare as duplicates, so the name
      // must not be dereferenced blindly when logging the dropped hit
      logger.info('[dupe]', { query: req.clean.text, hit: hit.name && hit.name.default });
    }

    // stop looping when requested size has been reached in uniqueResults
    return req.clean.size <= uniqueResults.length;
  });

  res.data = uniqueResults;

  next();
}
/**
 * Decide whether two result items should be kept as separate results.
 *
 * Items count as the same only when all of the following match strictly:
 * admin1_abbr, alpha3, name.default, address.number, address.street and
 * address.zip. When only one item owns a `name` (or `address`) property the
 * items are different; when neither owns it, the raw values are compared.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean} true when the items differ, false when they are dupes
 */
function isDifferent(item1, item2) {
  try {
    return !(
      propsMatch(item1, item2, ['admin1_abbr', 'alpha3']) &&
      nestedPropsMatch(item1, item2, 'name', ['default']) &&
      nestedPropsMatch(item1, item2, 'address', ['number', 'street', 'zip'])
    );
  }
  catch (err) {
    // best effort: items that cannot be compared at all (e.g. a null item or
    // a null `name`) are kept as distinct results instead of failing
    return true;
  }
}

/**
 * Compare the sub-object stored under `key` on both items.
 *
 * @param {object} item1
 * @param {object} item2
 * @param {string} key name of the property holding the sub-object
 * @param {string[]} props properties of the sub-object to compare
 * @returns {boolean} true when the sub-objects are considered equal
 */
function nestedPropsMatch(item1, item2, key, props) {
  var hasOwn = Object.prototype.hasOwnProperty;

  if (hasOwn.call(item1, key) && hasOwn.call(item2, key)) {
    return propsMatch(item1[key], item2[key], props);
  }

  // at least one side does not own the property: compare whatever is there
  return item1[key] === item2[key];
}

/**
 * Check that every listed property is strictly equal on both objects.
 *
 * @param {object} item1
 * @param {object} item2
 * @param {string[]} props property names to compare
 * @returns {boolean} true when all listed properties are strictly equal
 */
function propsMatch(item1, item2, props) {
  return props.every(function (prop) {
    return item1[prop] === item2[prop];
  });
}
module.exports = setup;

5
query/autocomplete.js

@ -1,7 +1,8 @@
var peliasQuery = require('pelias-query'), var peliasQuery = require('pelias-query'),
defaults = require('./autocomplete_defaults'), defaults = require('./autocomplete_defaults'),
check = require('check-types'); check = require('check-types'),
calcSize = require('../helper/sizeCalculator');
//------------------------------ //------------------------------
// autocomplete query // autocomplete query
@ -31,7 +32,7 @@ function generateQuery( clean ){
vs.var( 'input:name', clean.text ); vs.var( 'input:name', clean.text );
// always 10 (not user definable due to caching) // always 10 (not user definable due to caching)
vs.var( 'size', 10 ); vs.var( 'size', calcSize(10));
// focus point // focus point
if( check.number(clean['focus.point.lat']) && if( check.number(clean['focus.point.lat']) &&

5
query/reverse.js

@ -1,6 +1,7 @@
var peliasQuery = require('pelias-query'), var peliasQuery = require('pelias-query'),
defaults = require('./reverse_defaults'), defaults = require('./reverse_defaults'),
check = require('check-types'); check = require('check-types'),
calcSize = require('../helper/sizeCalculator');
//------------------------------ //------------------------------
// reverse geocode query // reverse geocode query
@ -30,7 +31,7 @@ function generateQuery( clean ){
// set size // set size
if( clean.size ){ if( clean.size ){
vs.var( 'size', clean.size ); vs.var( 'size', calcSize(clean.size));
} }
// focus point to score by distance // focus point to score by distance

8
query/search.js

@ -2,7 +2,8 @@ var peliasQuery = require('pelias-query'),
defaults = require('./search_defaults'), defaults = require('./search_defaults'),
textParser = require('./text_parser'), textParser = require('./text_parser'),
check = require('check-types'), check = require('check-types'),
geolib = require('geolib'); geolib = require('geolib'),
calcSize = require('../helper/sizeCalculator');
//------------------------------ //------------------------------
// general-purpose search query // general-purpose search query
@ -52,9 +53,8 @@ function generateQuery( clean ){
vs.var( 'input:name', clean.text ); vs.var( 'input:name', clean.text );
// size // size
if( clean.size ){ // specify twice as much data as we need so we can filter out dupes
vs.var( 'size', clean.size ); vs.var( 'size', calcSize(clean.size || defaults.size));
}
// focus point // focus point
if( check.number(clean['focus.point.lat']) && if( check.number(clean['focus.point.lat']) &&

4
routes/v1.js

@ -30,6 +30,7 @@ var postProc = {
distances: require('../middleware/distance'), distances: require('../middleware/distance'),
confidenceScores: require('../middleware/confidenceScore'), confidenceScores: require('../middleware/confidenceScore'),
confidenceScoresReverse: require('../middleware/confidenceScoreReverse'), confidenceScoresReverse: require('../middleware/confidenceScoreReverse'),
dedupe: require('../middleware/dedupe'),
localNamingConventions: require('../middleware/localNamingConventions'), localNamingConventions: require('../middleware/localNamingConventions'),
renamePlacenames: require('../middleware/renamePlacenames'), renamePlacenames: require('../middleware/renamePlacenames'),
geocodeJSON: require('../middleware/geocodeJSON'), geocodeJSON: require('../middleware/geocodeJSON'),
@ -61,6 +62,7 @@ function addRoutes(app, peliasConfig) {
controllers.search(), controllers.search(),
postProc.distances('focus.point.'), postProc.distances('focus.point.'),
postProc.confidenceScores(peliasConfig), postProc.confidenceScores(peliasConfig),
postProc.dedupe(),
postProc.localNamingConventions(), postProc.localNamingConventions(),
postProc.renamePlacenames(), postProc.renamePlacenames(),
postProc.geocodeJSON(peliasConfig, base), postProc.geocodeJSON(peliasConfig, base),
@ -72,6 +74,7 @@ function addRoutes(app, peliasConfig) {
controllers.search(null, require('../query/autocomplete')), controllers.search(null, require('../query/autocomplete')),
postProc.distances('focus.point.'), postProc.distances('focus.point.'),
postProc.confidenceScores(peliasConfig), postProc.confidenceScores(peliasConfig),
postProc.dedupe(),
postProc.localNamingConventions(), postProc.localNamingConventions(),
postProc.renamePlacenames(), postProc.renamePlacenames(),
postProc.geocodeJSON(peliasConfig, base), postProc.geocodeJSON(peliasConfig, base),
@ -85,6 +88,7 @@ function addRoutes(app, peliasConfig) {
// reverse confidence scoring depends on distance from origin // reverse confidence scoring depends on distance from origin
// so it must be calculated first // so it must be calculated first
postProc.confidenceScoresReverse(), postProc.confidenceScoresReverse(),
postProc.dedupe(),
postProc.localNamingConventions(), postProc.localNamingConventions(),
postProc.renamePlacenames(), postProc.renamePlacenames(),
postProc.geocodeJSON(peliasConfig, base), postProc.geocodeJSON(peliasConfig, base),

2
test/unit/fixture/autocomplete_linguistic_focus.js

@ -115,6 +115,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/autocomplete_linguistic_focus_null_island.js

@ -115,6 +115,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/autocomplete_linguistic_only.js

@ -87,6 +87,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

248
test/unit/fixture/dedupe_elasticsearch_results.js

@ -0,0 +1,248 @@
module.exports = [
{
'center_point': {
'lon': -76.207456,
'lat': 40.039265
},
'address': {},
'local_admin': 'East Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'East Lampeter High School'
},
'admin1': 'Pennsylvania',
'locality': 'Smoketown',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Greenland',
'category': [
'education'
],
'_id': '357321757',
'_type': 'osmnode',
'_score': 1.2367082,
'confidence': 0.879
},
{
'center_point': {
'lon': -76.23246,
'lat': 39.99288
},
'address': {},
'local_admin': 'West Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'Lampeter-Strasburg High School'
},
'admin1': 'Pennsylvania',
'locality': 'Lampeter',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Wheatland Mills',
'category': [
'education'
],
'_id': '4559068',
'_type': 'geoname',
'_score': 1.2367082,
'confidence': 0.879
},
{
'center_point': {
'lon': -76.20746,
'lat': 40.03927
},
'address': {},
'local_admin': 'East Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'East Lampeter High School'
},
'admin1': 'Pennsylvania',
'locality': 'Smoketown',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Greenland',
'category': [
'education'
],
'_id': '5187980',
'_type': 'geoname',
'_score': 1.2367082,
'confidence': 0.879
},
{
'center_point': {
'lon': -76.232457,
'lat': 39.992877
},
'address': {},
'local_admin': 'West Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'Lampeter-Strasburg High School'
},
'admin1': 'Pennsylvania',
'locality': 'Lampeter',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Wheatland Mills',
'category': [
'education'
],
'_id': '357294404',
'_type': 'osmnode',
'_score': 1.2367082,
'confidence': 0.879
},
{
'center_point': {
'lon': -76.207456,
'lat': 40.038987
},
'address': {},
'local_admin': 'East Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'East Lampeter School'
},
'admin1': 'Pennsylvania',
'locality': 'Smoketown',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Greenland',
'category': [
'education'
],
'_id': '357283977',
'_type': 'osmnode',
'_score': 1.1036991,
'confidence': 0.664
},
{
'center_point': {
'lon': -76.20746,
'lat': 40.03899
},
'address': {},
'local_admin': 'East Lampeter',
'admin1_abbr': 'PA',
'name': {
'default': 'East Lampeter School'
},
'admin1': 'Pennsylvania',
'locality': 'Smoketown',
'alpha3': 'USA',
'admin2': 'Lancaster County',
'admin0': 'United States',
'neighborhood': 'Greenland',
'category': [
'education'
],
'_id': '5187966',
'_type': 'geoname',
'_score': 1.1036991,
'confidence': 0.664
},
{
'center_point': {
'lon': -94.167445,
'lat': 38.762788
},
'address': {},
'local_admin': 'Polk',
'admin1_abbr': 'MO',
'name': {
'default': 'Strasburg School'
},
'admin1': 'Missouri',
'locality': 'Strasburg',
'alpha3': 'USA',
'admin2': 'Cass County',
'admin0': 'United States',
'category': [
'education'
],
'_id': '358058986',
'_type': 'osmnode',
'_score': 1.0492544,
'confidence': 0.658
},
{
'center_point': {
'lon': -78.36317,
'lat': 38.98445
},
'address': {},
'admin1_abbr': 'VA',
'name': {
'default': 'Strasburg High School'
},
'admin1': 'Virginia',
'locality': 'Strasburg',
'alpha3': 'USA',
'admin2': 'Shenandoah County',
'admin0': 'United States',
'neighborhood': 'Strasburg Junction',
'category': [
'education'
],
'_id': '4787978',
'_type': 'geoname',
'_score': 0.9724125,
'confidence': 0.649
},
{
'center_point': {
'lon': -100.16516,
'lat': 46.13427
},
'address': {},
'local_admin': 'Strasburg',
'admin1_abbr': 'ND',
'name': {
'default': 'Strasburg High School'
},
'admin1': 'North Dakota',
'locality': 'Strasburg',
'alpha3': 'USA',
'admin2': 'Emmons County',
'admin0': 'United States',
'category': [
'education'
],
'_id': '9683163',
'_type': 'geoname',
'_score': 0.9724125,
'confidence': 0.649
},
{
'center_point': {
'lon': -81.532392,
'lat': 40.597578
},
'address': {},
'local_admin': 'Franklin',
'admin1_abbr': 'OH',
'name': {
'default': 'Strasburg High School'
},
'admin1': 'Ohio',
'locality': 'Strasburg',
'alpha3': 'USA',
'admin2': 'Tuscarawas County',
'admin0': 'United States',
'category': [
'education'
],
'_id': '356646971',
'_type': 'osmway',
'_score': 0.9724125,
'confidence': 0.649
}
];

2
test/unit/fixture/search_boundary_country.js

@ -97,6 +97,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_full_address.js

@ -178,7 +178,7 @@ module.exports = {
} }
} }
}, },
'size': 10, 'size': 20,
'sort': [ '_score' ], 'sort': [ '_score' ],
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_bbox.js

@ -103,6 +103,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_focus.js

@ -117,6 +117,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_focus_bbox.js

@ -133,6 +133,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_focus_null_island.js

@ -117,6 +117,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_only.js

@ -87,6 +87,6 @@ module.exports = {
} }
}, },
'sort': [ '_score' ], 'sort': [ '_score' ],
'size': 10, 'size': 20,
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_linguistic_viewport.js

@ -128,7 +128,7 @@ module.exports = {
} }
} }
}, },
'size': 10, 'size': 20,
'track_scores': true, 'track_scores': true,
'sort': [ 'sort': [
'_score' '_score'

2
test/unit/fixture/search_linguistic_viewport_min_diagonal.js

@ -128,7 +128,7 @@ module.exports = {
} }
} }
}, },
'size': 10, 'size': 20,
'track_scores': true, 'track_scores': true,
'sort': [ 'sort': [
'_score' '_score'

2
test/unit/fixture/search_partial_address.js

@ -145,7 +145,7 @@ module.exports = {
} }
} }
}, },
'size': 10, 'size': 20,
'sort': [ '_score' ], 'sort': [ '_score' ],
'track_scores': true 'track_scores': true
}; };

2
test/unit/fixture/search_regions_address.js

@ -161,7 +161,7 @@ module.exports = {
} }
} }
}, },
'size': 10, 'size': 20,
'sort': [ '_score' ], 'sort': [ '_score' ],
'track_scores': true 'track_scores': true
}; };

39
test/unit/helper/sizeCalculator.js

@ -0,0 +1,39 @@
var calcSize = require('../../../helper/sizeCalculator.js');
module.exports.tests = {};
// sanity check: the helper module must export a callable
module.exports.tests.interface = function(test, common) {
  test('interface', function(t) {
    var exportedType = typeof calcSize;

    t.equal(exportedType, 'function', 'valid function');
    t.end();
  });
};
// checks the size returned for a zero, a single and a multi-result request
module.exports.tests.valid = function(test, common) {
  // each entry: [ requested size, expected calculated size ]
  var expectations = [
    [0, 1],
    [1, 1],
    [10, 20]
  ];

  expectations.forEach(function (pair) {
    var requested = pair[0];
    var expected = pair[1];

    test('size=' + requested, function (t) {
      t.equal(calcSize(requested), expected);
      t.end();
    });
  });
};
// registers every test group of this file with the supplied tape instance,
// prefixing each test name with the name of the unit under test
module.exports.all = function (tape, common) {
  var prefix = 'sizeCalculator: ';

  var test = function (name, testFunction) {
    return tape(prefix + name, testFunction);
  };

  var groupName;
  for( groupName in module.exports.tests ){
    module.exports.tests[groupName](test, common);
  }
};

52
test/unit/middleware/dedupe.js

@ -0,0 +1,52 @@
var data = require('../fixture/dedupe_elasticsearch_results');
var dedupe = require('../../../middleware/dedupe')();
module.exports.tests = {};
// exercises the dedupe middleware against a fixture of 10 hits in which
// three places each appear twice (7 unique results in total)
module.exports.tests.dedupe = function(test, common) {
  test('filter out duplicates', function(t) {
    var req = {
      clean: {
        text: 'lampeter strasburg high school',
        // larger than the fixture, so only duplicate removal shortens the list
        size: 100
      }
    };
    var res = {
      data: data
    };

    var expectedCount = 7;

    dedupe(req, res, function () {
      t.equal(res.data.length, expectedCount, 'results have fewer items than before');
      t.end();
    });
  });

  test('truncate results based on specified size', function(t) {
    var req = {
      clean: {
        text: 'lampeter strasburg high school',
        // smaller than the number of unique hits, so truncation must kick in
        size: 3
      }
    };
    var res = {
      data: data
    };

    dedupe(req, res, function () {
      t.equal(res.data.length, req.clean.size, 'results are truncated to the requested size');
      t.end();
    });
  });
};
// registers every test group of this file with the supplied tape instance,
// prefixing each test name with the name of the unit under test
module.exports.all = function (tape, common) {
  var prefix = '[middleware] dedupe: ';

  var test = function (name, testFunction) {
    return tape(prefix + name, testFunction);
  };

  var groupName;
  for( groupName in module.exports.tests ){
    module.exports.tests[groupName](test, common);
  }
};

5
test/unit/query/reverse.js

@ -80,13 +80,14 @@ module.exports.tests.query = function(test, common) {
test('size fuzz test', function(t) { test('size fuzz test', function(t) {
// test different sizes // test different sizes
var sizes = [1,2,10,undefined,null]; var sizes = [1,2,10,undefined,null];
sizes.forEach( function( size ){ var expectedSizes = [1,4,20,1,1];
sizes.forEach( function( size, index ){
var query = generate({ var query = generate({
'point.lat': 29.49136, 'point.lon': -82.50622, size: size 'point.lat': 29.49136, 'point.lon': -82.50622, size: size
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
t.equal( compiled.size, size ? size : 1, 'valid reverse query for size: '+ size); t.equal( compiled.size, expectedSizes[index], 'valid reverse query for size: '+ size);
}); });
t.end(); t.end();
}); });

2
test/unit/run.js

@ -12,10 +12,12 @@ var tests = [
require('./helper/text_parser'), require('./helper/text_parser'),
require('./helper/type_mapping'), require('./helper/type_mapping'),
require('./helper/types'), require('./helper/types'),
require('./helper/sizeCalculator'),
require('./middleware/confidenceScore'), require('./middleware/confidenceScore'),
require('./middleware/confidenceScoreReverse'), require('./middleware/confidenceScoreReverse'),
require('./middleware/distance'), require('./middleware/distance'),
require('./middleware/localNamingConventions'), require('./middleware/localNamingConventions'),
require('./middleware/dedupe'),
require('./query/autocomplete'), require('./query/autocomplete'),
require('./query/autocomplete_defaults'), require('./query/autocomplete_defaults'),
require('./query/search_defaults'), require('./query/search_defaults'),

Loading…
Cancel
Save