diff --git a/controller/index.js b/controller/index.js deleted file mode 100644 index d900793f..00000000 --- a/controller/index.js +++ /dev/null @@ -1,31 +0,0 @@ - -var pkg = require('../package'); -var markdown = require('markdown').markdown; -var fs = require('fs'); - -function setup(){ - - var styleString = ''; - var text = '# Pelias API\n'; - text += '### Version: ['+ pkg.version+ '](https://github.com/pelias/api/releases)\n'; - text += fs.readFileSync( './DOCS.md', 'utf8'); - var indexHtml = styleString + markdown.toHTML(text); - - function controller( req, res, next ) { - if (req.accepts('html')) { - res.send(indexHtml); - return; - } - // default behaviour - res.json({ - name: pkg.name, - version: { - number: pkg.version - } - }); - } - - return controller; -} - -module.exports = setup; diff --git a/controller/markdownToHtml.js b/controller/markdownToHtml.js new file mode 100644 index 00000000..656afe01 --- /dev/null +++ b/controller/markdownToHtml.js @@ -0,0 +1,28 @@ + +var markdown = require('markdown').markdown; +var fs = require('fs'); + +function setup(peliasConfig, markdownFile){ + + var styleString = ''; + var text = '# Pelias API\n'; + text += '### Version: [' + peliasConfig.version + '](https://github.com/pelias/api/releases)\n'; + text += fs.readFileSync( markdownFile, 'utf8'); + var html = styleString + markdown.toHTML(text); + + function controller( req, res ) { + if (req.accepts('html')) { + res.send(html); + return; + } + // default behaviour + res.json({ + markdown: text, + html: html + }); + } + + return controller; +} + +module.exports = setup; diff --git a/controller/search.js b/controller/search.js index d70c1b5c..c4c54754 100644 --- a/controller/search.js +++ b/controller/search.js @@ -15,19 +15,20 @@ function setup( backend, query ){ body: query( req.clean ) }; - if (req.clean.type !== undefined) { + // ? 
+ if( req.clean.hasOwnProperty('type') ){ cmd.type = req.clean.type; - delete req.clean.type; // remove type from clean to avoid clutter } // query backend - service.search( backend, cmd, function( err, docs ){ + service.search( backend, cmd, function( err, docs, meta ){ // error handler if( err ){ return next( err ); } req.results = { - data: docs + data: docs, + meta: meta }; next(); diff --git a/helper/geojsonify.js b/helper/geojsonify.js index a56b86eb..82c4cf88 100644 --- a/helper/geojsonify.js +++ b/helper/geojsonify.js @@ -18,7 +18,8 @@ var DETAILS_PROPS = [ 'county', 'localadmin', 'locality', - 'neighbourhood' + 'neighbourhood', + 'confidence' ]; diff --git a/helper/types.js b/helper/types.js index a87a65b8..7455476e 100644 --- a/helper/types.js +++ b/helper/types.js @@ -5,7 +5,7 @@ var valid_types = require( '../query/types' ); */ var intersection = function intersection(set1, set2) { return set2.filter(function(value) { - return set1.indexOf(value) !== -1; + return set1.indexOf(value) !== -1; }); }; @@ -14,7 +14,9 @@ module.exports = function calculate_types(clean_types) { return undefined; } - + /* the layers and source parameters are cumulative: + * perform a set intersection of their specified types + */ if (clean_types.from_layers || clean_types.from_source) { var types = valid_types; @@ -29,7 +31,11 @@ module.exports = function calculate_types(clean_types) { return types; } + /* + * Type restrictions requested by the address parser should only be used + * if both the source and layers parameters are empty, so do this last + */ if (clean_types.from_address_parser) { return clean_types.from_address_parser; } -}; +}; \ No newline at end of file diff --git a/middleware/confidenceScore.js b/middleware/confidenceScore.js new file mode 100644 index 00000000..6c55bc27 --- /dev/null +++ b/middleware/confidenceScore.js @@ -0,0 +1,255 @@ +/** + * + *Basic confidence score should be computed and returned for each item in the results. 
+ * The score should range between 0-1, and take into consideration as many factors as possible. + * + * Some factors to consider: + * + * - number of results from ES + * - score of item within the range of highest-lowest scores from ES (within the returned set) + * - linguistic match of query + * - detection (or specification) of query type. i.e. an address shouldn't match an admin address. + */ + +var stats = require('stats-lite'); +var logger = require('pelias-logger').get('api'); + +var RELATIVE_SCORES = true; + +function setup(peliasConfig) { + RELATIVE_SCORES = peliasConfig.hasOwnProperty('relativeScores') ? peliasConfig.relativeScores : true; + return computeScores; +} + +function computeScores(req, res, next) { + // do nothing if no result data set + if (!req.results || !req.results.data || !req.results.meta) { + return next(); + } + + // compute standard deviation and mean from all scores + var scores = req.results.meta.scores; + var stdev = computeStandardDeviation(scores); + var mean = stats.mean(scores); + + // loop through data items and determine confidence scores + req.results.data = req.results.data.map(computeConfidenceScore.bind(null, req, mean, stdev)); + + next(); +} + +/** + * Check all types of things to determine how confident we are that this result + * is correct. Score is based on overall score distribution in the result set + * as well as how closely the result matches the input parameters. 
+ * + * @param {object} req + * @param {number} mean + * @param {number} stdev + * @param {object} hit + * @returns {object} + */ +function computeConfidenceScore(req, mean, stdev, hit) { + var dealBreakers = checkForDealBreakers(req, hit); + if (dealBreakers) { + hit.confidence = 0.5; + return hit; + } + + var checkCount = 3; + hit.confidence = 0; + + if (RELATIVE_SCORES) { + checkCount += 2; + hit.confidence += checkDistanceFromMean(hit._score, mean, stdev); + hit.confidence += computeZScore(hit._score, mean, stdev); + } + hit.confidence += checkName(req.clean.input, req.clean.parsed_input, hit); + hit.confidence += checkQueryType(req.clean.parsed_input, hit); + hit.confidence += checkAddress(req.clean.parsed_input, hit); + + // TODO: look at categories and location + + hit.confidence /= checkCount; + + logger.debug('[confidence]:', hit.confidence, hit.name.default); + + return hit; +} + +function checkForDealBreakers(req, hit) { + if (!req.clean.parsed_input) { + return false; + } + + if (req.clean.parsed_input.state && req.clean.parsed_input.state !== hit.admin1_abbr) { + logger.debug('[confidence][deal-breaker]: state !== admin1_abbr'); + return true; + } + + if (req.clean.parsed_input.postalcode && req.clean.parsed_input.postalcode !== hit.zip) { + logger.debug('[confidence][deal-breaker]: postalcode !== zip'); + return true; + } +} + +/** + * Check how statistically significant the score of this result is + * given mean and standard deviation + * + * @param {number} score + * @param {number} mean + * @param {number} stdev + * @returns {number} + */ +function checkDistanceFromMean(score, mean, stdev) { + return (score - mean) > stdev ? 
1 : 0; +} + +/** + * Compare input string or name component of parsed_input against + * default name in result + * + * @param {string} input + * @param {object|undefined} parsed_input + * @param {object} hit + * @returns {number} + */ +function checkName(input, parsed_input, hit) { + // parsed_input name should take precedence if available since it's the cleaner name property + if (parsed_input && parsed_input.name && hit.name.default.toLowerCase() === parsed_input.name.toLowerCase()) { + return 1; + } + + // if no parsed_input check the input value as provided against result's default name + if (hit.name.default.toLowerCase() === input.toLowerCase()) { + return 1; + } + + // if no matches detected, don't judge too harshly since it was a longshot anyway + return 0.7; +} + +/** + * Input being set indicates the query was for an address + * check if house number was specified and found in result + * + * @param {object|undefined} input + * @param {object} hit + * @returns {number} + */ +function checkQueryType(input, hit) { + if (!!input.number && (!hit.address || (hit.address && !hit.address.number))) { + return 0; + } + return 1; +} + +/** + * Determine the quality of the property match + * + * @param {string|number|undefined|null} inputProp + * @param {string|number|undefined|null} hitProp + * @param {boolean} expectEnriched + * @returns {number} + */ +function propMatch(inputProp, hitProp, expectEnriched) { + + // both missing, but expect to have enriched value in result => BAD + if (!inputProp && !hitProp && expectEnriched) { return 0; } + + // both missing, and no enrichment expected => GOOD + if (!inputProp && !hitProp) { return 1; } + + // input has it, result doesn't => BAD + if (inputProp && !hitProp) { return 0; } + + // input missing, result has it, and enrichment is expected => GOOD + if (!inputProp && hitProp && expectEnriched) { return 1; } + + // input missing, result has it, enrichment not desired => 50/50 + if (!inputProp && hitProp) { return 0.5; } 
+ + // both present, values match => GREAT + if (inputProp && hitProp && inputProp.toString().toLowerCase() === hitProp.toString().toLowerCase()) { return 1; } + + // ¯\_(ツ)_/¯ + return 0.7; +} + +/** + * Check various parts of the parsed input address + * against the results + * + * @param {object} input + * @param {string|number} [input.number] + * @param {string} [input.street] + * @param {string} [input.postalcode] + * @param {string} [input.state] + * @param {string} [input.country] + * @param {object} hit + * @param {object} [hit.address] + * @param {string|number} [hit.address.number] + * @param {string} [hit.address.street] + * @param {string|number} [hit.zip] + * @param {string} [hit.admin1_abbr] + * @param {string} [hit.alpha3] + * @returns {number} + */ +function checkAddress(input, hit) { + var checkCount = 5; + var res = 0; + + if (input && input.number && input.street) { + res += propMatch(input.number, (hit.address ? hit.address.number : null), false); + res += propMatch(input.street, (hit.address ? hit.address.street : null), false); + res += propMatch(input.postalcode, (hit.address ? hit.address.zip: null), true); + res += propMatch(input.state, hit.admin1_abbr, true); + res += propMatch(input.country, hit.alpha3, true); + + res /= checkCount; + } + else { + res = 1; + } + + return res; +} + +/** + * z-scores have an effective range of -3.00 to +3.00. + * An average z-score is ZERO. + * A negative z-score indicates that the item/element is below + * average and a positive z-score means that the item/element + * is above average. When teachers say they are going to "curve" + * the test, they do this by computing z-scores for the students' test scores. 
+ * + * @param {number} score + * @param {number} mean + * @param {number} stdev + * @returns {number} + */ +function computeZScore(score, mean, stdev) { + if (stdev < 0.01) { + return 0; + } + // because the effective range of z-scores is -3.00 to +3.00 + // add 10 to ensure a positive value, and then divide by 10+3+3 + // to further normalize to %-like result + return (((score - mean) / (stdev)) + 10) / 16; +} + +/** + * Computes standard deviation given an array of values + * + * @param {Array} scores + * @returns {number} + */ +function computeStandardDeviation(scores) { + var stdev = stats.stdev(scores); + // if stdev is low, just consider it 0 + return (stdev < 0.01) ? 0 : stdev; +} + + +module.exports = setup; diff --git a/middleware/geocodeJSON.js b/middleware/geocodeJSON.js index da6cfdf6..d2f58b46 100644 --- a/middleware/geocodeJSON.js +++ b/middleware/geocodeJSON.js @@ -25,16 +25,10 @@ function convertToGeocodeJSON(peliasConfig, req, next) { // the GeocodeJSON spec that is implemented by this instance. req.results.geojson.geocoding.version = '0.1'; - // OPTIONAL. Default: null. The licence of the data. In case of multiple sources, - // and then multiple licences, can be an object with one key by source. - // Can be a freeform text property describing the licensing details. - // Can be a URI on the server, which outlines licensing details. - req.results.geojson.geocoding.license = peliasConfig.host + '/license'; - // OPTIONAL. Default: null. The attribution of the data. In case of multiple sources, // and then multiple attributions, can be an object with one key by source. // Can be a URI on the server, which outlines attribution details. - req.results.geojson.geocoding.attribution = peliasConfig.host + '/attribution'; + req.results.geojson.geocoding.attribution = peliasConfig.host + 'attribution'; // OPTIONAL. Default: null. The query that has been issued to trigger the // search. 
diff --git a/package.json b/package.json index 5c90c84e..50dc376b 100644 --- a/package.json +++ b/package.json @@ -52,6 +52,7 @@ "pelias-query": "^1.1.0", "pelias-schema": "1.0.0", "pelias-suggester-pipeline": "2.0.2", + "stats-lite": "^1.0.3", "through2": "0.6.5" }, "devDependencies": { diff --git a/DOCS.md b/public/apiDoc.md similarity index 100% rename from DOCS.md rename to public/apiDoc.md diff --git a/public/attribution.md b/public/attribution.md new file mode 100644 index 00000000..db9c392e --- /dev/null +++ b/public/attribution.md @@ -0,0 +1,7 @@ +## Attribution +* Geocoding by [Pelias](https://mapzen.com/pelias) from [Mapzen](https://mapzen.com) +* Data from + * [OpenStreetMap](http://www.openstreetmap.org/copyright) © OpenStreetMap contributors under [ODbL](http://opendatacommons.org/licenses/odbl/) + * [Quattroshapes](https://github.com/foursquare/quattroshapes/blob/master/LICENSE.md) under [CC-BY-2.0](https://creativecommons.org/licenses/by/2.0/) + * [GeoNames](http://www.geonames.org/) under [CC-BY-3.0](https://creativecommons.org/licenses/by/3.0/) + * and other sources diff --git a/routes/v1.js b/routes/v1.js index d5a1c183..24ecccf2 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -1,3 +1,4 @@ +var express = require('express'); var Router = require('express').Router; var reverseQuery = require('../query/reverse'); @@ -16,7 +17,7 @@ var middleware = { /** ----------------------- controllers ----------------------- **/ var controllers = { - index: require('../controller/index'), + mdToHTML: require('../controller/markdownToHtml'), place: require('../controller/place'), search: require('../controller/search') }; @@ -24,6 +25,7 @@ var controllers = { /** ----------------------- controllers ----------------------- **/ var postProc = { + confidenceScores: require('../middleware/confidenceScore'), renamePlacenames: require('../middleware/renamePlacenames'), geocodeJSON: require('../middleware/geocodeJSON'), sendJSON: require('../middleware/sendJSON') @@ 
-41,12 +43,16 @@ function addRoutes(app, peliasConfig) { var routers = { index: createRouter([ - controllers.index() + controllers.mdToHTML(peliasConfig, './public/apiDoc.md') + ]), + attribution: createRouter([ + controllers.mdToHTML(peliasConfig, './public/attribution.md') ]), search: createRouter([ sanitisers.search.middleware, middleware.types, controllers.search(), + postProc.confidenceScores(peliasConfig), postProc.renamePlacenames(), postProc.geocodeJSON(peliasConfig), postProc.sendJSON @@ -54,6 +60,7 @@ function addRoutes(app, peliasConfig) { reverse: createRouter([ sanitisers.reverse.middleware, controllers.search(undefined, reverseQuery), + // TODO: add confidence scores postProc.renamePlacenames(), postProc.geocodeJSON(peliasConfig), postProc.sendJSON @@ -72,6 +79,7 @@ function addRoutes(app, peliasConfig) { // api root app.get ( base, routers.index ); + app.get ( base + 'attribution', routers.attribution ); app.get ( base + 'place', routers.place ); app.get ( base + 'autocomplete', routers.search ); app.get ( base + 'search', routers.search ); diff --git a/service/search.js b/service/search.js index 15b5cd14..1e77f69f 100644 --- a/service/search.js +++ b/service/search.js @@ -23,20 +23,28 @@ function service( backend, cmd, cb ){ // map returned documents var docs = []; + var meta = { + scores: [] + }; + if( data && data.hits && data.hits.total && Array.isArray(data.hits.hits)){ + docs = data.hits.hits.map( function( hit ){ + meta.scores.push(hit._score); + // map metadata in to _source so we // can serve it up to the consumer hit._source._id = hit._id; hit._source._type = hit._type; + hit._source._score = hit._score; return hit._source; }); } // fire callback - return cb( null, docs ); + return cb( null, docs, meta ); }); } diff --git a/test/unit/controller/index.js b/test/unit/controller/index.js index f59dacd4..cffe1030 100644 --- a/test/unit/controller/index.js +++ b/test/unit/controller/index.js @@ -1,19 +1,19 @@ -var setup = 
require('../../../controller/index'); +var setup = require('../../../controller/markdownToHtml'); module.exports.tests = {}; module.exports.tests.interface = function(test, common) { test('valid interface', function(t) { t.equal(typeof setup, 'function', 'setup is a function'); - t.equal(typeof setup(), 'function', 'setup returns a controller'); + t.equal(typeof setup({}, './public/apiDoc.md'), 'function', 'setup returns a controller'); t.end(); }); }; module.exports.tests.info_json = function(test, common) { test('returns server info in json', function(t) { - var controller = setup(); + var controller = setup({}, './public/attribution.md'); var req = { accepts: function (format) { t.equal(format, 'html', 'check for Accepts:html'); @@ -22,9 +22,8 @@ module.exports.tests.info_json = function(test, common) { }; var res = { json: function( json ){ t.equal(typeof json, 'object', 'returns json'); - t.equal(typeof json.name, 'string', 'name'); - t.equal(typeof json.version, 'object', 'version'); - t.equal(typeof json.version.number, 'string', 'version number'); + t.assert(json.hasOwnProperty('markdown'), 'return object contains markdown property'); + t.assert(json.hasOwnProperty('html'), 'return object contains html property'); t.end(); }}; controller( req, res ); @@ -33,21 +32,24 @@ module.exports.tests.info_json = function(test, common) { module.exports.tests.info_html = function(test, common) { test('returns server info in html', function(t) { + var filePath = './foo.md'; var style = ''; var mockText = 'this text should show up in the html content'; var fsMock = { readFileSync: function (path, format) { - t.equal(path, './DOCS.md', 'open DOCS.md file'); + t.equal(path, filePath, 'open specified file'); t.equal(format, 'utf8', 'file format'); return mockText; } }; var proxyquire = require('proxyquire'); - var setup = proxyquire('../../../controller/index', { 'fs': fsMock }); + var setup = proxyquire('../../../controller/markdownToHtml', { 'fs': fsMock }); - var 
controller = setup(); + var config = { version: '1.1.1' }; + + var controller = setup(config, filePath); var req = { accepts: function () { return true; diff --git a/test/unit/mock/backend.js b/test/unit/mock/backend.js index fe617fe7..201ab7b5 100644 --- a/test/unit/mock/backend.js +++ b/test/unit/mock/backend.js @@ -10,6 +10,7 @@ responses['client/search/ok/1'] = function( cmd, cb ){ return cb( undefined, searchEnvelope([{ _id: 'myid1', _type: 'mytype1', + _score: 10, _source: { value: 1, center_point: { lat: 100.1, lon: -50.5 }, @@ -19,6 +20,7 @@ responses['client/search/ok/1'] = function( cmd, cb ){ }, { _id: 'myid2', _type: 'mytype2', + _score: 20, _source: { value: 2, center_point: { lat: 100.2, lon: -51.5 }, @@ -35,6 +37,7 @@ responses['client/mget/ok/1'] = function( cmd, cb ){ return cb( undefined, mgetEnvelope([{ _id: 'myid1', _type: 'mytype1', + _score: 10, found: true, _source: { value: 1, @@ -45,6 +48,7 @@ responses['client/mget/ok/1'] = function( cmd, cb ){ }, { _id: 'myid2', _type: 'mytype2', + _score: 20, found: true, _source: { value: 2, diff --git a/test/unit/service/search.js b/test/unit/service/search.js index e1f97541..a7212775 100644 --- a/test/unit/service/search.js +++ b/test/unit/service/search.js @@ -19,6 +19,7 @@ module.exports.tests.functional_success = function(test, common) { var expected = [ { _id: 'myid1', _type: 'mytype1', + _score: 10, value: 1, center_point: { lat: 100.1, lon: -50.5 }, name: { default: 'test name1' }, @@ -26,6 +27,7 @@ module.exports.tests.functional_success = function(test, common) { }, { _id: 'myid2', _type: 'mytype2', + _score: 20, value: 2, center_point: { lat: 100.2, lon: -51.5 }, name: { default: 'test name2' },