mirror of https://github.com/pelias/api.git
Diana Shkolnikov
8 years ago
4 changed files with 450 additions and 2 deletions
@ -0,0 +1,255 @@
|
||||
/** |
||||
* |
||||
* Basic confidence score should be computed and returned for each item in the results. |
||||
* The score should range between 0-1, and take into consideration as many factors as possible. |
||||
* |
||||
* Some factors to consider: |
||||
* |
||||
* - number of results from ES |
||||
* - fallback status (aka layer match between expected and actual) |
||||
*/ |
||||
|
||||
var check = require('check-types'); |
||||
|
||||
/**
 * Middleware factory.
 *
 * @returns {function} the computeScores middleware, invoked as (req, res, next)
 */
function setup() {
  var middleware = computeScores;
  return middleware;
}
||||
|
||||
/**
 * Attach a confidence score to every item in res.data.
 *
 * Nothing is scored (and next() is called straight away) when req.clean,
 * res, res.data or res.meta is undefined, or when res.meta.query_type is
 * anything other than 'fallback'.
 * Later add disambiguation to the list of handled query types.
 *
 * @param {object} req
 * @param {object} res
 * @param {function} next
 */
function computeScores(req, res, next) {
  var missingInput = check.undefined(req.clean) ||
    check.undefined(res) ||
    check.undefined(res.data) ||
    check.undefined(res.meta);

  if (missingInput || res.meta.query_type !== 'fallback') {
    return next();
  }

  // score each data item; req is pre-bound so map() supplies the hit
  var scoreHit = computeConfidenceScore.bind(null, req);
  res.data = res.data.map(scoreHit);

  next();
}
||||
|
||||
/**
 * Check all types of things to determine how confident we are that this
 * result is correct.
 *
 * A hit with a deal breaker gets a fixed confidence of 0.5. Otherwise the
 * confidence is the average of the individual checks (name, query type,
 * address, plus the two relative-score checks when they are available),
 * rounded to three decimal places. The hit is mutated in place.
 *
 * @param {object} req
 * @param {object} hit
 * @returns {object} the same hit, with `confidence` set
 */
function computeConfidenceScore(req, hit) {
  if (checkForDealBreakers(req, hit)) {
    hit.confidence = 0.5;
    return hit;
  }

  var checkCount = 3;
  hit.confidence = 0;

  // RELATIVE_SCORES, mean and stdev are neither parameters nor locals of
  // this function. `typeof` is safe on undeclared identifiers, so when the
  // bindings do not exist the relative checks are skipped instead of
  // throwing a ReferenceError for every hit.
  var relativeScoresAvailable =
    typeof RELATIVE_SCORES !== 'undefined' && RELATIVE_SCORES &&
    typeof mean !== 'undefined' && typeof stdev !== 'undefined';

  if (relativeScoresAvailable) {
    checkCount += 2;
    hit.confidence += checkDistanceFromMean(hit._score, mean, stdev);
    hit.confidence += computeZScore(hit._score, mean, stdev);
  }

  hit.confidence += checkName(req.clean.text, req.clean.parsed_text, hit);
  hit.confidence += checkQueryType(req.clean.parsed_text, hit);
  hit.confidence += checkAddress(req.clean.parsed_text, hit);

  // TODO: look at categories and location

  hit.confidence /= checkCount;
  hit.confidence = Number((hit.confidence).toFixed(3));

  return hit;
}
||||
|
||||
/**
 * Check for clearly mismatching properties in a result.
 * Zip code and state (region) are currently checked if present.
 *
 * - state: a deal breaker when the parsed state differs from the first
 *   entry of hit.parent.region_a (skipped when the hit has no parent or no
 *   region_a).
 * - postalcode: a deal breaker when the hit has address_parts and the parsed
 *   postalcode is not strictly equal to hit.address_parts.zip.
 *
 * @param {object} req request; req.clean.parsed_text may be undefined
 * @param {object} hit
 * @returns {boolean} true when a deal breaker was found, false otherwise
 */
function checkForDealBreakers(req, hit) {
  var parsed = req.clean.parsed_text;

  if (check.undefined(parsed)) {
    return false;
  }

  // hits are not guaranteed to carry a parent hierarchy
  var regionAbbr = check.assigned(hit.parent) ? hit.parent.region_a : undefined;

  if (check.assigned(parsed.state) && regionAbbr && parsed.state !== regionAbbr[0]) {
    // only log when a logger binding is actually in scope
    if (typeof logger !== 'undefined') {
      logger.debug('[confidence][deal-breaker]: state !== region_a');
    }
    return true;
  }

  if (check.assigned(parsed.postalcode) && check.assigned(hit.address_parts) &&
      parsed.postalcode !== hit.address_parts.zip) {
    return true;
  }

  return false;
}
||||
|
||||
/**
 * Check how statistically significant the score of this result is,
 * given the mean and standard deviation.
 *
 * @param {number} score
 * @param {number} mean
 * @param {number} stdev
 * @returns {number} 1 when the score exceeds the mean by more than one
 *   standard deviation, otherwise 0
 */
function checkDistanceFromMean(score, mean, stdev) {
  var distance = score - mean;

  if (distance > stdev) {
    return 1;
  }

  return 0;
}
||||
|
||||
/**
 * Compare the text string, or the name component of parsed_text, against
 * the default name of the result (case-insensitively).
 *
 * @param {string} text
 * @param {object|undefined} parsed_text
 * @param {object} hit
 * @returns {number} 1 on a match, otherwise 0.7
 */
function checkName(text, parsed_text, hit) {
  var resultName = hit.name.default.toLowerCase();

  // parsed_text.name takes precedence when available since it is the
  // cleaner name property
  var hasParsedName = check.assigned(parsed_text) && check.assigned(parsed_text.name);
  if (hasParsedName && resultName === parsed_text.name.toLowerCase()) {
    return 1;
  }

  // otherwise compare the text exactly as provided; if nothing matches,
  // don't judge too harshly since it was a longshot anyway
  return resultName === text.toLowerCase() ? 1 : 0.7;
}
||||
|
||||
/**
 * text being set indicates the query was for an address;
 * check whether a house number was specified and found in the result.
 *
 * @param {object|undefined} text parsed text of the query
 * @param {object} hit
 * @returns {number} 0 when a house number was requested but the hit has no
 *   address_parts or no address_parts.number, otherwise 1
 */
function checkQueryType(text, hit) {
  var houseNumberRequested = check.assigned(text) && check.assigned(text.number);
  if (!houseNumberRequested) {
    return 1;
  }

  var parts = hit.address_parts;

  if (check.undefined(parts)) {
    return 0;
  }

  if (check.assigned(parts) && check.undefined(parts.number)) {
    return 0;
  }

  return 1;
}
||||
|
||||
/**
 * Determine the quality of the property match.
 *
 * "Missing" means check.undefined(...) and "present" means
 * check.assigned(...). The enrichment flag counts as set whenever
 * check.assigned(expectEnriched) holds.
 * NOTE(review): presumably check.assigned treats any non-null, non-undefined
 * value as assigned, in which case an explicit `false` also counts as
 * "enrichment expected" — confirm against check-types before relying on it.
 *
 * @param {string|number|undefined|null} textProp
 * @param {string|number|undefined|null} hitProp
 * @param {boolean} expectEnriched
 * @returns {number} 0, 0.5, 0.7 or 1
 */
function propMatch(textProp, hitProp, expectEnriched) {
  var textMissing = check.undefined(textProp);
  var textPresent = check.assigned(textProp);
  var hitMissing = check.undefined(hitProp);
  var hitPresent = check.assigned(hitProp);
  var enrichmentFlag = check.assigned(expectEnriched);

  if (textMissing) {
    if (hitMissing) {
      // both missing: BAD if an enriched value was expected, GOOD otherwise
      return enrichmentFlag ? 0 : 1;
    }
    if (hitPresent) {
      // only the result has it: GOOD if enrichment was expected, else 50/50
      return enrichmentFlag ? 1 : 0.5;
    }
    return 0.7;
  }

  if (textPresent) {
    // text has it, result doesn't => BAD
    if (hitMissing) {
      return 0;
    }
    // both present, values match => GREAT
    if (hitPresent &&
        textProp.toString().toLowerCase() === hitProp.toString().toLowerCase()) {
      return 1;
    }
  }

  // ¯\_(ツ)_/¯
  return 0.7;
}
||||
|
||||
/**
 * Check various parts of the parsed text address against the result.
 *
 * Only queries that carry both a house number and a street are evaluated;
 * every other query scores 1. For address queries the score is the average
 * of five propMatch() results (number, street, postalcode, state, country).
 * A hit property that is unavailable is passed to propMatch() as null.
 *
 * @param {object} text
 * @param {string|number} [text.number]
 * @param {string} [text.street]
 * @param {string} [text.postalcode]
 * @param {string} [text.state]
 * @param {string} [text.country]
 * @param {object} hit
 * @param {object} [hit.address_parts]
 * @param {string|number} [hit.address_parts.number]
 * @param {string} [hit.address_parts.street]
 * @param {string|number} [hit.address_parts.zip]
 * @param {object} [hit.parent]
 * @param {Array} [hit.parent.region_a]
 * @param {Array} [hit.parent.country_a]
 * @returns {number}
 */
function checkAddress(text, hit) {
  var isAddressQuery = check.assigned(text) &&
    check.assigned(text.number) &&
    check.assigned(text.street);

  if (!isAddressQuery) {
    return 1;
  }

  var checkCount = 5;
  var parts = hit.address_parts;
  // a hit without a parent hierarchy is treated like one without region or
  // country abbreviations rather than causing a TypeError
  var parent = hit.parent;
  var regionAbbr = (parent && parent.region_a) ? parent.region_a[0] : null;
  var countryAbbr = (parent && parent.country_a) ? parent.country_a[0] : null;

  var res = 0;
  res += propMatch(text.number, (parts ? parts.number : null), false);
  res += propMatch(text.street, (parts ? parts.street : null), false);
  res += propMatch(text.postalcode, (parts ? parts.zip : null), true);
  res += propMatch(text.state, regionAbbr, true);
  res += propMatch(text.country, countryAbbr, true);

  return res / checkCount;
}
||||
|
||||
/**
 * z-scores have an effective range of -3.00 to +3.00.
 * An average z-score is ZERO.
 * A negative z-score indicates that the item/element is below average and
 * a positive z-score means that the item/element is above average.
 * When teachers say they are going to "curve" the test, they do this by
 * computing z-scores for the students' test scores.
 *
 * @param {number} score
 * @param {number} mean
 * @param {number} stdev
 * @returns {number} 0 when stdev is below 0.01, otherwise the shifted and
 *   scaled z-score
 */
function computeZScore(score, mean, stdev) {
  if (stdev < 0.01) {
    return 0;
  }

  var zScore = (score - mean) / stdev;

  // because the effective range of z-scores is -3.00 to +3.00,
  // add 10 to ensure a positive value, then divide by 10+3+3
  // to further normalize to a %-like result
  return (zScore + 10) / 16;
}
||||
|
||||
/**
 * Computes the standard deviation of an array of values by delegating to
 * stats.stdev().
 * NOTE(review): `stats` is not declared in the visible part of this file —
 * confirm it is in scope before calling this helper.
 *
 * @param {Array} scores
 * @returns {number} the standard deviation, or 0 when it is below 0.01
 */
function computeStandardDeviation(scores) {
  var deviation = stats.stdev(scores);

  // if the deviation is low, just consider it 0
  if (deviation < 0.01) {
    return 0;
  }

  return deviation;
}
||||
|
||||
|
||||
module.exports = setup; |
@ -0,0 +1,182 @@
|
||||
var confidenceScore = require('../../../middleware/confidenceScore')(); |
||||
|
||||
module.exports.tests = {}; |
||||
|
||||
/**
 * Unit tests for the confidenceScore middleware.
 *
 * Every test builds its own request/response fixtures because the
 * middleware replaces res.data and sets `confidence` on the hits.
 *
 * @param {function} test tape-style test(name, callback) function
 * @param {object} common shared test helpers (unused here)
 */
module.exports.tests.confidenceScore = function(test, common) {

  // stand-in for the `next` callback
  function noop() {}

  // fresh address-style request: house number, street and state
  function addressRequest() {
    return {
      clean: {
        text: '123 Main St, City, NM',
        parsed_text: {
          number: 123,
          street: 'Main St',
          state: 'NM'
        }
      }
    };
  }

  // fresh single-hit response whose region fields are explicitly undefined
  function responseWithoutRegion(queryType) {
    return {
      data: [{
        _score: 10,
        found: true,
        value: 1,
        center_point: { lat: 100.1, lon: -50.5 },
        name: { default: 'test name1' },
        parent: {
          country: ['country1'],
          region: undefined,
          region_a: undefined,
          county: ['city1']
        }
      }],
      meta: {
        scores: [10],
        query_type: queryType
      }
    };
  }

  test('empty res and req should not throw exception', function(t) {
    function testIt() {
      confidenceScore({}, {}, noop);
    }

    t.doesNotThrow(testIt, 'an exception should not have been thrown');
    t.end();
  });

  test('res.results without parsed_text should not throw exception', function(t) {
    var req = {};
    var res = {
      data: [{
        name: 'foo'
      }],
      meta: [10]
    };

    function testIt() {
      confidenceScore(req, res, noop);
    }

    t.doesNotThrow(testIt, 'an exception should not have been thrown');
    t.end();
  });

  test('hit without address should not error', function(t) {
    var req = {
      clean: {
        text: 'test name3',
        parsed_text: {
          postalcode: 12345
        }
      }
    };
    var res = {
      data: [{
        name: {
          default: 'foo'
        }
      }],
      meta: {
        scores: [10],
        query_type: 'original'
      }
    };

    function testIt() {
      confidenceScore(req, res, noop);
    }

    t.doesNotThrow(testIt, 'an exception should not have been thrown with no address');
    t.end();
  });

  // unique title: this case asserts the score value, unlike the
  // "should not throw" case above that has no req.clean at all
  test('exact name match without parsed_text should set confidence score', function(t) {
    var req = {
      clean: { text: 'test name1' }
    };
    var res = {
      data: [{
        _score: 10,
        found: true,
        value: 1,
        center_point: { lat: 100.1, lon: -50.5 },
        name: { default: 'test name1' },
        parent: {
          country: ['country1'],
          region: ['state1'],
          county: ['city1']
        }
      }, {
        _score: 20,
        value: 2,
        center_point: { lat: 100.2, lon: -51.5 },
        name: { default: 'test name2' },
        parent: {
          country: ['country2'],
          region: ['state2'],
          county: ['city2']
        }
      }],
      meta: {
        scores: [10],
        query_type: 'original'
      }
    };

    confidenceScore(req, res, noop);
    t.equal(res.data[0].confidence, 0.6, 'score was set');
    t.end();
  });

  test('undefined region fields should be handled gracefully', function(t) {
    var req = addressRequest();
    var res = responseWithoutRegion('original');

    confidenceScore(req, res, noop);
    t.equal(res.data[0].confidence, 0.28, 'score was set');
    t.end();
  });

  test('should only work for original query_type', function(t) {
    var req = addressRequest();
    var res = responseWithoutRegion('fallback');

    confidenceScore(req, res, noop);
    t.false(res.data[0].hasOwnProperty('confidence'), 'score was not set');
    t.end();
  });
};
||||
|
||||
/**
 * Run every suite registered on module.exports.tests, prefixing each test
 * name so the output identifies this middleware.
 *
 * @param {function} tape tape-style test(name, callback) function
 * @param {object} common shared helpers handed through to each suite
 */
module.exports.all = function (tape, common) {
  var prefix = '[middleware] confidenceScore: ';
  var suites = module.exports.tests;

  function test(name, testFunction) {
    return tape(prefix + name, testFunction);
  }

  var suiteName;
  for (suiteName in suites) {
    suites[suiteName](test, common);
  }
};
Loading…
Reference in new issue