diff --git a/middleware/confidenceScoreFallback.js b/middleware/confidenceScoreFallback.js index cead1a35..8593ed60 100644 --- a/middleware/confidenceScoreFallback.js +++ b/middleware/confidenceScoreFallback.js @@ -10,6 +10,7 @@ */ var check = require('check-types'); +var logger = require('pelias-logger').get('api-confidence'); function setup() { return computeScores; @@ -39,217 +40,79 @@ function computeScores(req, res, next) { * @returns {object} */ function computeConfidenceScore(req, hit) { - var dealBreakers = checkForDealBreakers(req, hit); - if (dealBreakers) { - hit.confidence = 0.5; + + // if parsed text doesn't exist, which it never should, just assign a low confidence and move on + if (!req.clean.hasOwnProperty('parsed_text')) { + hit.confidence = 0.1; + hit.match_type = 'unknown'; return hit; } - var checkCount = 3; - hit.confidence = 0; - - if (RELATIVE_SCORES) { - checkCount += 2; - hit.confidence += checkDistanceFromMean(hit._score, mean, stdev); - hit.confidence += computeZScore(hit._score, mean, stdev); - } - hit.confidence += checkName(req.clean.text, req.clean.parsed_text, hit); - hit.confidence += checkQueryType(req.clean.parsed_text, hit); - hit.confidence += checkAddress(req.clean.parsed_text, hit); + // start with a confidence level of 1 because we trust ES queries to be accurate + hit.confidence = 1.0; - // TODO: look at categories and location + // in the case of fallback there might be deductions + hit.confidence *= checkFallbackLevel(req, hit); - hit.confidence /= checkCount; + // truncate the precision hit.confidence = Number((hit.confidence).toFixed(3)); return hit; } -/* - * Check for clearly mismatching properties in a result - * zip code and state (region) are currently checked if present - * - * @param {object|undefined} text - * @param {object} hit - * @returns {bool} - */ -function checkForDealBreakers(req, hit) { - if (check.undefined(req.clean.parsed_text)) { - return false; - } - - if (check.assigned(req.clean.parsed_text.state) && hit.parent.region_a && req.clean.parsed_text.state !== hit.parent.region_a[0]) { - logger.debug('[confidence][deal-breaker]: state !== region_a'); - return true; +function checkFallbackLevel(req, hit) { + if (checkFallbackOccurred(req, hit)) { + hit.match_type = 'fallback'; + + // if we know a fallback occurred, deduct points based on layer granularity + switch (hit.layer) { + case 'venue': + case 'address': + logger.warn('Fallback scenarios should not result in address or venue records!', req.clean.parsed_text); + return 0.8; + case 'street': + return 0.8; + case 'locality': + case 'borough': + case 'neighbourhood': + return 0.6; + case 'macrocounty': + case 'county': + case 'localadmin': + return 0.4; + case 'region': + return 0.3; + case 'country': + case 'dependency': + case 'macroregion': + return 0.1; + default: + return 0.1; + } } - if (check.assigned(req.clean.parsed_text.postalcode) && check.assigned(hit.address_parts) && - req.clean.parsed_text.postalcode !== hit.address_parts.zip) { - return true; - } + hit.match_type = 'exact'; + return 1.0; } -/** - * Check how statistically significant the score of this result is - * given mean and standard deviation - * - * @param {number} score - * @param {number} mean - * @param {number} stdev - * @returns {number} - */ -function checkDistanceFromMean(score, mean, stdev) { - return (score - mean) > stdev ? 1 : 0; -} +function checkFallbackOccurred(req, hit) { + // at this time we only do this for address queries, so keep this simple + // TODO: add other layer checks once we start handling disambiguation -/** - * Compare text string or name component of parsed_text against - * default name in result - * - * @param {string} text - * @param {object|undefined} parsed_text - * @param {object} hit - * @returns {number} - */ -function checkName(text, parsed_text, hit) { - // parsed_text name should take precedence if available since it's the cleaner name property - if (check.assigned(parsed_text) && check.assigned(parsed_text.name) && - hit.name.default.toLowerCase() === parsed_text.name.toLowerCase()) { - return 1; - } - - // if no parsed_text check the text value as provided against result's default name - if (hit.name.default.toLowerCase() === text.toLowerCase()) { - return 1; - } - - // if no matches detected, don't judge too harshly since it was a longshot anyway - return 0.7; + return (requestedAddress(req) && hit.layer !== 'address') || + (requestedStreet(req) && hit.layer !== 'street'); } -/** - * text being set indicates the query was for an address - * check if house number was specified and found in result - * - * @param {object|undefined} text - * @param {object} hit - * @returns {number} - */ -function checkQueryType(text, hit) { - if (check.assigned(text) && check.assigned(text.number) && - (check.undefined(hit.address_parts) || - (check.assigned(hit.address_parts) && check.undefined(hit.address_parts.number)))) { - return 0; - } - return 1; +function requestedAddress(req) { + // house number and street name were specified + return req.clean.parsed_text.hasOwnProperty('number') && + req.clean.parsed_text.hasOwnProperty('street'); } -/** - * Determine the quality of the property match - * - * @param {string|number|undefined|null} textProp - * @param {string|number|undefined|null} hitProp - * @param {boolean} expectEnriched - * @returns {number} - */ -function propMatch(textProp, hitProp, expectEnriched) { - - // both missing, but expect to have enriched value in result => BAD - if (check.undefined(textProp) && check.undefined(hitProp) && check.assigned(expectEnriched)) { return 0; } - - // both missing, and no enrichment expected => GOOD - if (check.undefined(textProp) && check.undefined(hitProp)) { return 1; } - - // text has it, result doesn't => BAD - if (check.assigned(textProp) && check.undefined(hitProp)) { return 0; } - - // text missing, result has it, and enrichment is expected => GOOD - if (check.undefined(textProp) && check.assigned(hitProp) && check.assigned(expectEnriched)) { return 1; } - - // text missing, result has it, enrichment not desired => 50/50 - if (check.undefined(textProp) && check.assigned(hitProp)) { return 0.5; } - - // both present, values match => GREAT - if (check.assigned(textProp) && check.assigned(hitProp) && - textProp.toString().toLowerCase() === hitProp.toString().toLowerCase()) { return 1; } - - // ¯\_(ツ)_/¯ - return 0.7; -} - -/** - * Check various parts of the parsed text address - * against the results - * - * @param {object} text - * @param {string|number} [text.number] - * @param {string} [text.street] - * @param {string} [text.postalcode] - * @param {string} [text.state] - * @param {string} [text.country] - * @param {object} hit - * @param {object} [hit.address_parts] - * @param {string|number} [hit.address_parts.number] - * @param {string} [hit.address_parts.street] - * @param {string|number} [hit.address_parts.zip] - * @param {Array} [hit.parent.region_a] - * @param {Array} [hit.parent.country_a] - * @returns {number} - */ -function checkAddress(text, hit) { - var checkCount = 5; - var res = 0; - - if (check.assigned(text) && check.assigned(text.number) && check.assigned(text.street)) { - res += propMatch(text.number, (hit.address_parts ? hit.address_parts.number : null), false); - res += propMatch(text.street, (hit.address_parts ? hit.address_parts.street : null), false); - res += propMatch(text.postalcode, (hit.address_parts ? hit.address_parts.zip: null), true); - res += propMatch(text.state, (hit.parent.region_a ? hit.parent.region_a[0] : null), true); - res += propMatch(text.country, (hit.parent.country_a ? hit.parent.country_a[0] :null), true); - - res /= checkCount; - } - else { - res = 1; - } - - return res; -} - -/** - * z-scores have an effective range of -3.00 to +3.00. - * An average z-score is ZERO. - * A negative z-score indicates that the item/element is below - * average and a positive z-score means that the item/element - * in above average. When teachers say they are going to "curve" - * the test, they do this by computing z-scores for the students' test scores. - * - * @param {number} score - * @param {number} mean - * @param {number} stdev - * @returns {number} - */ -function computeZScore(score, mean, stdev) { - if (stdev < 0.01) { - return 0; - } - // because the effective range of z-scores is -3.00 to +3.00 - // add 10 to ensure a positive value, and then divide by 10+3+3 - // to further normalize to %-like result - return (((score - mean) / (stdev)) + 10) / 16; +function requestedStreet(req) { + // only street name was specified + return !req.clean.parsed_text.hasOwnProperty('number') && + req.clean.parsed_text.hasOwnProperty('street'); } -/** - * Computes standard deviation given an array of values - * - * @param {Array} scores - * @returns {number} - */ -function computeStandardDeviation(scores) { - var stdev = stats.stdev(scores); - // if stdev is low, just consider it 0 - return (stdev < 0.01) ? 0 : stdev; -} - - module.exports = setup; diff --git a/test/unit/middleware/confidenceScore.js b/test/unit/middleware/confidenceScore.js index 7d6ba87d..a0b4de6f 100644 --- a/test/unit/middleware/confidenceScore.js +++ b/test/unit/middleware/confidenceScore.js @@ -46,7 +46,8 @@ module.exports.tests.confidenceScore = function(test, common) { } }], meta: { - scores: [10] + scores: [10], + query_type: 'original' } }; @@ -86,7 +87,10 @@ module.exports.tests.confidenceScore = function(test, common) { county: ['city2'] } }], - meta: {scores: [10]} + meta: { + scores: [10], + query_type: 'original' + } }; confidenceScore(req, res, function() {}); @@ -119,13 +123,52 @@ module.exports.tests.confidenceScore = function(test, common) { county: ['city1'] } }], - meta: {scores: [10]} + meta: { + scores: [10], + query_type: 'original' + } }; confidenceScore(req, res, function() {}); t.equal(res.data[0].confidence, 0.28, 'score was set'); t.end(); }); + + test('should only work for original query_type', function(t) { + var req = { + clean: { + text: '123 Main St, City, NM', + parsed_text: { + number: 123, + street: 'Main St', + state: 'NM' + } + } + }; + var res = { + data: [{ + _score: 10, + found: true, + value: 1, + center_point: { lat: 100.1, lon: -50.5 }, + name: { default: 'test name1' }, + parent: { + country: ['country1'], + region: undefined, + region_a: undefined, + county: ['city1'] + } + }], + meta: { + scores: [10], + query_type: 'fallback' + } + }; + + confidenceScore(req, res, function() {}); + t.false(res.data[0].hasOwnProperty('confidence'), 'score was not set'); + t.end(); + }); }; module.exports.all = function (tape, common) { diff --git a/test/unit/middleware/confidenceScoreFallback.js b/test/unit/middleware/confidenceScoreFallback.js index a0b4de6f..5fcd03de 100644 --- a/test/unit/middleware/confidenceScoreFallback.js +++ b/test/unit/middleware/confidenceScoreFallback.js @@ -1,4 +1,4 @@ -var confidenceScore = require('../../../middleware/confidenceScore')(); +var confidenceScore = require('../../../middleware/confidenceScoreFallback')(); module.exports.tests = {}; @@ -89,16 +89,16 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'original' + query_type: 'fallback' } }; confidenceScore(req, res, function() {}); - t.equal(res.data[0].confidence, 0.6, 'score was set'); + t.equal(res.data[0].confidence, 0.1, 'score was set'); t.end(); }); - test('undefined region fields should be handled gracefully', function(t) { + test('no fallback addresses should have max score', function(t) { var req = { clean: { text: '123 Main St, City, NM', @@ -114,32 +114,31 @@ module.exports.tests.confidenceScore = function(test, common) { _score: 10, found: true, value: 1, + layer: 'address', center_point: { lat: 100.1, lon: -50.5 }, name: { default: 'test name1' }, parent: { country: ['country1'], - region: undefined, - region_a: undefined, + region: ['region1'], county: ['city1'] } }], meta: { scores: [10], - query_type: 'original' + query_type: 'fallback' } }; confidenceScore(req, res, function() {}); - t.equal(res.data[0].confidence, 0.28, 'score was set'); + t.equal(res.data[0].confidence, 1.0, 'max score was set'); t.end(); }); - test('should only work for original query_type', function(t) { + test('no fallback street query should have max score', function(t) { var req = { clean: { - text: '123 Main St, City, NM', + text: 'Main St, City, NM', parsed_text: { - number: 123, street: 'Main St', state: 'NM' } @@ -150,12 +149,12 @@ module.exports.tests.confidenceScore = function(test, common) { _score: 10, found: true, value: 1, + layer: 'street', center_point: { lat: 100.1, lon: -50.5 }, name: { default: 'test name1' }, parent: { country: ['country1'], - region: undefined, - region_a: undefined, + region: ['region1'], county: ['city1'] } }], @@ -166,7 +165,76 @@ module.exports.tests.confidenceScore = function(test, common) { }; confidenceScore(req, res, function() {}); - t.false(res.data[0].hasOwnProperty('confidence'), 'score was not set'); + t.equal(res.data[0].confidence, 1.0, 'max score was set'); + t.end(); + }); + + test('fallback to locality should have score deduction', function(t) { + var req = { + clean: { + text: '123 Main St, City, NM', + parsed_text: { + number: 123, + street: 'Main St', + state: 'NM' + } + } + }; + var res = { + data: [{ + _score: 10, + found: true, + value: 1, + layer: 'locality', + center_point: { lat: 100.1, lon: -50.5 }, + name: { default: 'test name1' }, + parent: { + country: ['country1'] + } + }], + meta: { + scores: [10], + query_type: 'fallback' + } + }; + + confidenceScore(req, res, function() {}); + t.equal(res.data[0].confidence, 0.6, 'score was set'); + t.end(); + }); + + test('fallback to country should have score deduction', function(t) { + var req = { + clean: { + text: '123 Main St, City, NM, USA', + parsed_text: { + number: 123, + street: 'Main St', + state: 'NM', + country: 'USA' + } + } + }; + var res = { + data: [{ + _score: 10, + found: true, + value: 1, + layer: 'country', + center_point: { lat: 100.1, lon: -50.5 }, + name: { default: 'test name1' }, + parent: { + country: ['country1'] + } + }], + meta: { + scores: [10], + query_type: 'fallback' + } + }; + + confidenceScore(req, res, function() {}); + t.equal(res.data[0].confidence, 0.1, 'score was set'); t.end(); }); };