diff --git a/helper/labelGenerator.js b/helper/labelGenerator.js index 20c73a98..97993e03 100644 --- a/helper/labelGenerator.js +++ b/helper/labelGenerator.js @@ -16,18 +16,27 @@ module.exports = function( record ){ // retain only things that are truthy labelParts = _.compact(labelParts); - // first, dedupe the name and 1st label array elements - // this is used to ensure that the `name` and first admin hierarchy elements aren't repeated - // eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]` - var dedupedNameAndFirstLabelElement = _.uniq([labelParts.shift(), labelParts.shift()]); + // third, dedupe and join with a comma and return + return dedupeNameAndFirstLabelElement(labelParts).join(', '); - // second, unshift the deduped parts back onto the labelParts - labelParts.unshift.apply(labelParts, dedupedNameAndFirstLabelElement); +}; - // third, join with a comma and return - return labelParts.join(', '); +function dedupeNameAndFirstLabelElement(labelParts) { + // only dedupe if a result has more than a name (the first label part) + if (labelParts.length > 1) { + // first, dedupe the name and 1st label array elements + // this is used to ensure that the `name` and first admin hierarchy elements aren't repeated + // eg - `["Lancaster", "Lancaster", "PA", "United States"]` -> `["Lancaster", "PA", "United States"]` + var deduped = _.uniq([labelParts.shift(), labelParts.shift()]); -}; + // second, unshift the deduped parts back onto the labelParts + labelParts.unshift.apply(labelParts, deduped); + + } + + return labelParts; + +} function getSchema(country_a) { if (country_a && country_a.length && schemas[country_a]) { diff --git a/helper/text_parser.js b/helper/text_parser.js deleted file mode 100644 index 0db8bede..00000000 --- a/helper/text_parser.js +++ /dev/null @@ -1,86 +0,0 @@ - -var parser = require('addressit'); -var extend = require('extend'); -var type_mapping = require('../helper/type_mapping'); -var check = require('check-types'); -var logger = require('pelias-logger').get('api'); - -var DELIM = ','; - -/* - * For performance, and to prefer POI and admin records, express a preference - * to only search coarse layers on very short text inputs. - */ -module.exports.get_layers = function get_layers(query) { - if (query.length <= 3 ) { - // no address parsing required - return type_mapping.layer_mapping.coarse; - } -}; - -module.exports.get_parsed_address = function get_parsed_address(query) { - - var getAdminPartsBySplittingOnDelim = function(queryParts) { - // naive approach - for admin matching during query time - // split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny' - - var address = {}; - - if (queryParts.length > 1) { - address.name = queryParts[0].trim(); - - // 1. slice away all parts after the first one - // 2. trim spaces from each part just in case - // 3. join the parts back together with appropriate delimiter and spacing - address.admin_parts = queryParts.slice(1) - .map(function (part) { return part.trim(); }) - .join(DELIM + ' '); - } - - return address; - }; - - var getAddressParts = function(query) { - // perform full address parsing - // except on queries so short they obviously can't contain an address - if (query.length > 3) { - return parser( query ); - } - }; - - var queryParts = query.split(DELIM); - - var addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts); - var addressWithAddressParts= getAddressParts(queryParts.join(DELIM + ' ')); - - var parsedAddress = extend(addressWithAdminParts, - addressWithAddressParts); - - var address_parts = [ 'name', - 'number', - 'street', - 'city', - 'state', - 'country', - 'postalcode', - 'regions', - 'admin_parts' - ]; - - var parsed_text = {}; - - address_parts.forEach(function(part){ - if (parsedAddress[part]) { - parsed_text[part] = parsedAddress[part]; - } - }); - - // if all we found was regions, ignore it as it is not enough information to make smarter decisions - if (Object.keys(parsed_text).length === 1 && !check.undefined(parsed_text.regions)) - { - logger.info('Ignoring address parser output, regions only'); - return null; - } - - return parsed_text; -}; diff --git a/helper/type_mapping.js b/helper/type_mapping.js index 0b20c111..ed20c0d8 100644 --- a/helper/type_mapping.js +++ b/helper/type_mapping.js @@ -49,7 +49,8 @@ var LAYERS_BY_SOURCE = { openaddresses: [ 'address' ], geonames: [ 'country', 'region', 'county', 'locality', 'venue' ], whosonfirst: [ 'continent', 'country', 'dependency', 'macroregion', 'region', - 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood', 'disputed'] + 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'borough', + 'neighbourhood', 'microhood', 'disputed'] }; /* diff --git a/middleware/geocodeJSON.js b/middleware/geocodeJSON.js index 80d69e86..414d216b 100644 --- a/middleware/geocodeJSON.js +++ b/middleware/geocodeJSON.js @@ -16,7 +16,7 @@ function setup(peliasConfig, basePath) { config: peliasConfig || require('pelias-config').generate().api, basePath: basePath || '/' }; - + function middleware(req, res, next) { return convertToGeocodeJSON(req, res, next, opts); } @@ -56,6 +56,10 @@ function convertToGeocodeJSON(req, res, next, opts) { // Helpful for debugging and understanding how the input impacts results. res.body.geocoding.query = req.clean; + // remove arrays produced by the tokenizer (only intended to be used internally). + delete res.body.geocoding.query.tokens_complete; + delete res.body.geocoding.query.tokens_incomplete; + // OPTIONAL. Warnings and errors. addMessages(req, 'warnings', res.body.geocoding); addMessages(req, 'errors', res.body.geocoding); diff --git a/middleware/options.js b/middleware/options.js index 57c0e1eb..f6ca8767 100644 --- a/middleware/options.js +++ b/middleware/options.js @@ -9,10 +9,10 @@ function middleware(req, res, next){ if( req.method === 'OPTIONS' ){ - res.send(200); + res.sendStatus(200); } else { next(); } } -module.exports = middleware; \ No newline at end of file +module.exports = middleware; diff --git a/package.json b/package.json index 0c0cc1f7..f70cb57b 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,6 @@ "elasticsearch": ">=1.2.1" }, "dependencies": { - "addressit": "git://github.com/dianashk/addressit.git#temp", "async": "^1.5.2", "check-types": "^6.0.0", "cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve", @@ -53,9 +52,10 @@ "morgan": "1.7.0", "pelias-config": "^1.0.1", "pelias-logger": "^0.0.8", - "pelias-model": "^3.1.0", - "pelias-query": "6.2.0", + "pelias-model": "^4.0.0", + "pelias-query": "6.3.0", "pelias-suggester-pipeline": "2.0.4", + "pelias-text-analyzer": "^1.0.1", "stats-lite": "1.0.3", "through2": "2.0.1" }, @@ -68,7 +68,7 @@ "precommit-hook": "^3.0.0", "proxyquire": "^1.4.0", "tap-dot": "1.0.5", - "tape": "^4.4.0" + "tape": "^4.5.1" }, "pre-commit": [ "lint", diff --git a/query/autocomplete.js b/query/autocomplete.js index ffc57396..6d5863a8 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -9,7 +9,9 @@ var views = { ngrams_strict: require('./view/ngrams_strict'), focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), - phrase_first_tokens_only: require('./view/phrase_first_tokens_only') + phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), + pop_subquery: require('./view/pop_subquery'), + boost_exact_matches: require('./view/boost_exact_matches') }; //------------------------------ @@ -32,14 +34,16 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost +query.score( views.boost_exact_matches ); query.score( views.focus_selected_layers( views.ngrams_strict ) ); -query.score( peliasQuery.view.popularity( views.ngrams_strict ) ); -query.score( peliasQuery.view.population( views.ngrams_strict ) ); +query.score( peliasQuery.view.popularity( views.pop_subquery ) ); +query.score( peliasQuery.view.population( views.pop_subquery ) ); // non-scoring hard filters query.filter( peliasQuery.view.sources ); @@ -59,29 +63,28 @@ function generateQuery( clean ){ vs.var( 'sources', clean.sources ); } - // mark the name as incomplete (user has not yet typed a comma) - vs.var( 'input:name:isComplete', false ); - - // perform some operations on 'clean.text': - // 1. if there is a space followed by a single char, remove them. - // - this is required as the index uses 2grams and sending 1grams - // - to a 2gram index when using 'type:phrase' or 'operator:and' will - // - result in a complete failure of the query. - // 2. trim leading and trailing whitespace. - var text = clean.text.replace(/( .$)/g,'').trim(); - - // if the input parser has run and suggested a 'parsed_text.name' to use. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - - // mark the name as complete (user has already typed a comma) - vs.var( 'input:name:isComplete', true ); - - // use 'parsed_text.name' instead of 'clean.text'. - text = clean.parsed_text.name; + // pass the input tokens to the views so they can choose which tokens + // are relevant for their specific function. + if( check.array( clean.tokens ) ){ + vs.var( 'input:name:tokens', clean.tokens ); + vs.var( 'input:name:tokens_complete', clean.tokens_complete ); + vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete ); } // input text - vs.var( 'input:name', text ); + vs.var( 'input:name', clean.text ); + + // if the tokenizer has run then we set 'input:name' to as the combination of the + // 'complete' tokens with the 'incomplete' tokens, the resuting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in then + // ngrams index. + if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){ + var combined = clean.tokens_complete.concat( clean.tokens_incomplete ); + if( combined.length ){ + vs.var( 'input:name', combined.join(' ') ); + } + } // focus point if( check.number(clean['focus.point.lat']) && diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049..08e33aeb 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -20,20 +20,20 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasPhrase', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 100, - 'phrase:analyzer': 'peliasPhrase', - 'phrase:field': 'phrase.default', + 'phrase:analyzer': 'peliasQueryFullToken', + 'phrase:field': 'name.default', 'phrase:boost': 1, - 'phrase:slop': 2, + 'phrase:slop': 3, 'focus:function': 'linear', 'focus:offset': '0km', 'focus:scale': '250km', 'focus:decay': 0.5, - 'focus:weight': 10, + 'focus:weight': 40, 'function_score:score_mode': 'avg', 'function_score:boost_mode': 'multiply', @@ -82,6 +82,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 200, + 'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 600, + 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', 'popularity:max_boost': 20, diff --git a/query/reverse_defaults.js b/query/reverse_defaults.js index 306efaac..06ad6400 100644 --- a/query/reverse_defaults.js +++ b/query/reverse_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/query/search.js b/query/search.js index 9f0a792c..77fcb3f5 100644 --- a/query/search.js +++ b/query/search.js @@ -30,6 +30,7 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/query/search_defaults.js b/query/search_defaults.js index ea0dc87f..281d25ae 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasIndexOneEdgeGram', 'ngram:field': 'name.default', 'ngram:boost': 1, @@ -78,6 +78,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:locality:field': 'parent.locality', 'admin:locality:boost': 1, + 'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 1, + 'admin:neighbourhood:analyzer': 'peliasAdmin', 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 1, diff --git a/query/text_parser.js b/query/text_parser.js index 38fca48e..00e60724 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -1,20 +1,15 @@ var logger = require('pelias-logger').get('api'); +var placeTypes = require('../helper/placeTypes'); /* This list should only contain admin fields we are comfortable matching in the case when we can't identify parts of an address. This shouldn't contain fields like country_a or postalcode because we should only try to match those when we're sure that's what they are. */ -var adminFields = [ - 'country', - 'region', - 'region_a', - 'county', - 'localadmin', - 'locality', - 'neighbourhood' -]; +var adminFields = placeTypes.concat([ + 'region_a' +]); /** @todo: refactor me @@ -101,4 +96,4 @@ function addParsedVariablesToQueryVariables( parsed_text, vs ){ } } -module.exports = addParsedVariablesToQueryVariables; \ No newline at end of file +module.exports = addParsedVariablesToQueryVariables; diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js new file mode 100644 index 00000000..9af56cfb --- /dev/null +++ b/query/view/boost_exact_matches.js @@ -0,0 +1,40 @@ + +var peliasQuery = require('pelias-query'), + searchDefaults = require('../search_defaults'); + +/** + This view (unfortunately) requires autocomplete to use the phrase.* index. + + ideally we wouldn't need to use this, but at time of writing we are unable + to distinguish between 'complete tokens' and 'grams' in the name.* index. + + this view was introduced in order to score exact matches higher than partial + matches, without it we find results such as "Clayton Avenue" appearing first + in the results list for the query "Clay Av". + + the view uses some of the values from the 'search_defaults.js' file to add an + additional 'SHOULD' condition which scores exact matches slighly higher + than partial matches. +**/ + +module.exports = function( vs ){ + + // make a copy of the variables so we don't interfere with the values + // passed to other views. + var vsCopy = new peliasQuery.Vars( vs.export() ); + + // copy phrase:* values from search defaults + vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); + vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); + + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } + + // set 'input:name' to be only the fully completed characters + vsCopy.var('input:name').set( tokens.join(' ') ); + + return peliasQuery.view.phrase( vsCopy ); +}; diff --git a/query/view/ngrams_last_token_only.js b/query/view/ngrams_last_token_only.js index 3e3315f7..2665c294 100644 --- a/query/view/ngrams_last_token_only.js +++ b/query/view/ngrams_last_token_only.js @@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'), eg. if the input was "100 foo str", then 'input:name' would only be 'str' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'), module.exports = function( vs ){ - // Totally disable this view when bool value 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ return null; } + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable - var name = vs.var('input:name').get(); - // set the 'name' variable in the copy to only the last token - vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return ngrams_strict( vsCopy ); diff --git a/query/view/phrase_first_tokens_only.js b/query/view/phrase_first_tokens_only.js index b047b30f..7ab4539b 100644 --- a/query/view/phrase_first_tokens_only.js +++ b/query/view/phrase_first_tokens_only.js @@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query'); eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query'); module.exports = function( vs ){ - // Don't mutate the name variable when 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ - // return the view rendered using the original vars - return peliasQuery.view.phrase( vs ); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable and split in to tokens - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // single token only, abort (we don't want the *last* token) - // return null here will completely disable the view. - if( tokens.length < 2 ){ return null; } - // set the 'name' variable in the copy to all but the last token - vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return peliasQuery.view.phrase( vsCopy ); diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js new file mode 100644 index 00000000..724b773f --- /dev/null +++ b/query/view/pop_subquery.js @@ -0,0 +1,17 @@ + +var peliasQuery = require('pelias-query'), + check = require('check-types'); + +/** + Population / Popularity subquery +**/ + +module.exports = function( vs ){ + + var view = peliasQuery.view.ngrams( vs ); + + view.match['name.default'].analyzer = vs.var('phrase:analyzer'); + delete view.match['name.default'].boost; + + return view; +}; diff --git a/sanitiser/_text.js b/sanitiser/_text.js index e6897a5e..4709eeee 100644 --- a/sanitiser/_text.js +++ b/sanitiser/_text.js @@ -1,5 +1,5 @@ var check = require('check-types'), - text_parser = require('../helper/text_parser'); + text_analyzer = require('pelias-text-analyzer'); // validate texts, convert types and apply defaults function sanitize( raw, clean ){ @@ -19,7 +19,7 @@ function sanitize( raw, clean ){ clean.text = raw.text; // parse text with query parser - var parsed_text = text_parser.get_parsed_address(clean.text); + var parsed_text = text_analyzer.parse(clean.text); if (check.assigned(parsed_text)) { clean.parsed_text = parsed_text; } diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js new file mode 100644 index 00000000..3312ea05 --- /dev/null +++ b/sanitiser/_tokenizer.js @@ -0,0 +1,112 @@ + +var check = require('check-types'); + +/** + simplified version of the elaticsearch tokenizer, used in order to + be able to detect which tokens are 'complete' (user has finished typing them) + or 'incomplete' (the user has possibly only typed part of the token). + + note: we don't need to strip punctuation as that will be handled on the + elasticsearch side, so sending a token such as 'st.' is not an issue, these + tokens should *not* be modified as the anaylsis can use the punctuation to + infer meaning. + + note: this sanitizer should run *after* the '_text' sanitizer so it can + use the output of clean.parsed_text where available. +**/ +function sanitize( raw, clean ){ + + // error & warning messages + var messages = { errors: [], warnings: [] }; + + // this is the string we will use for analysis + var text = clean.text; + + // a boolean to track whether the input parser successfully ran; or not. + var inputParserRanSuccessfully = false; + + // if the text parser has run then we only tokenize the 'name' section + // of the 'parsed_text' object, ignoring the 'admin' parts. + if( clean.hasOwnProperty('parsed_text') ) { + inputParserRanSuccessfully = true; + + // parsed_text.name is set, this is the highest priority, use this string + if( clean.parsed_text.hasOwnProperty('name') ){ + text = clean.parsed_text.name; // use this string instead + } + + // else handle the case where parsed_text.street was produced but + // no parsed_text.name is produced. + // additionally, handle the case where parsed_text.number is present + // note: the addressit module may also produce parsed_text.unit info + // for now, we discard that information as we don't have an appropriate + else if( clean.parsed_text.hasOwnProperty('street') ){ + text = [ + clean.parsed_text.number, + clean.parsed_text.street + ].filter(function(el){return el;}) + .join(' '); // remove empty elements + } + } + + // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. + clean.tokens = []; + clean.tokens_complete = []; + clean.tokens_incomplete = []; + + // sanity check that the text is valid. + if( check.nonEmptyString( text ) ){ + + // split according to the regex used in the elasticsearch tokenizer + // see: https://github.com/pelias/schema/blob/master/settings.js + // see: settings.analysis.tokenizer.peliasNameTokenizer + clean.tokens = text + .split(/[\s,\\\/]+/) // split on delimeters + .filter(function(el){return el;}); // remove empty elements + } + + /** + the following section splits the tokens in to two arrays called + 'tokens_complete' and 'tokens_incomplete'. + + it also strips any tokens from 'tokens_incomplete' which might not + match the ngrams index (such as single grams not stored in the index). + **/ + + // split the tokens in to 'complete' and 'incomplete'. + if( clean.tokens.length ){ + + // if all the tokens are complete, simply copy them from clean.tokens + if( inputParserRanSuccessfully ){ + + // all these tokens are complete! + clean.tokens_complete = clean.tokens.slice(); + + // user hasn't finished typing yet + } else { + + // make a copy of the tokens and remove the last element + var tokensCopy = clean.tokens.slice(), + lastToken = tokensCopy.pop(); + + // set all but the last token as 'complete' + clean.tokens_complete = tokensCopy; + + /** + if the last token is a single non-numeric character then we must discard it. + + at time of writing, single non-numeric ngrams are not stored in the index, + sending them as part of the query would result in 0 documents being returned. + **/ + if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){ + clean.tokens_incomplete = [ lastToken ]; + } + } + + } + + return messages; +} + +// export function +module.exports = sanitize; diff --git a/sanitiser/autocomplete.js b/sanitiser/autocomplete.js index f9698956..8ab6fd9c 100644 --- a/sanitiser/autocomplete.js +++ b/sanitiser/autocomplete.js @@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'), sanitizers = { singleScalarParameters: require('../sanitiser/_single_scalar_parameters'), text: require('../sanitiser/_text'), + tokenizer: require('../sanitiser/_tokenizer'), size: require('../sanitiser/_size')(10, 10, 10), layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping), sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping), diff --git a/test/ciao/autocomplete/layers_alias_coarse.coffee b/test/ciao/autocomplete/layers_alias_coarse.coffee index 2fa2265c..3db308be 100644 --- a/test/ciao/autocomplete/layers_alias_coarse.coffee +++ b/test/ciao/autocomplete/layers_alias_coarse.coffee @@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/autocomplete/layers_invalid.coffee b/test/ciao/autocomplete/layers_invalid.coffee index 620b5586..6f3cebe0 100644 --- a/test/ciao/autocomplete/layers_invalid.coffee +++ b/test/ciao/autocomplete/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/autocomplete/layers_mix_invalid_valid.coffee b/test/ciao/autocomplete/layers_mix_invalid_valid.coffee index 963b79ab..a819dd44 100644 --- a/test/ciao/autocomplete/layers_mix_invalid_valid.coffee +++ b/test/ciao/autocomplete/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/reverse/layers_alias_coarse.coffee b/test/ciao/reverse/layers_alias_coarse.coffee index 09c91483..40ce2e37 100644 --- a/test/ciao/reverse/layers_alias_coarse.coffee +++ b/test/ciao/reverse/layers_alias_coarse.coffee @@ -40,6 +40,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/reverse/layers_invalid.coffee b/test/ciao/reverse/layers_invalid.coffee index aaec4864..bc57a3b3 100644 --- a/test/ciao/reverse/layers_invalid.coffee +++ b/test/ciao/reverse/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/reverse/layers_mix_invalid_valid.coffee b/test/ciao/reverse/layers_mix_invalid_valid.coffee index 307b225d..16f40b9e 100644 --- a/test/ciao/reverse/layers_mix_invalid_valid.coffee +++ b/test/ciao/reverse/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/search/layers_alias_coarse.coffee b/test/ciao/search/layers_alias_coarse.coffee index bf7cdb52..48723853 100644 --- a/test/ciao/search/layers_alias_coarse.coffee +++ b/test/ciao/search/layers_alias_coarse.coffee @@ -41,6 +41,7 @@ json.geocoding.query.layers.should.eql [ "continent", "macrocounty", "county", "macrohood", + "borough", "neighbourhood", "microhood", "disputed" diff --git a/test/ciao/search/layers_invalid.coffee b/test/ciao/search/layers_invalid.coffee index 4f2da456..cc6feab8 100644 --- a/test/ciao/search/layers_invalid.coffee +++ b/test/ciao/search/layers_invalid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/ciao/search/layers_mix_invalid_valid.coffee b/test/ciao/search/layers_mix_invalid_valid.coffee index f004c69e..f332e051 100644 --- a/test/ciao/search/layers_mix_invalid_valid.coffee +++ b/test/ciao/search/layers_mix_invalid_valid.coffee @@ -24,7 +24,7 @@ json.features.should.be.instanceof Array #? expected errors should.exist json.geocoding.errors -json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,macroregion,region,county,locality,continent,macrocounty,dependency,localadmin,macrohood,neighbourhood,microhood,disputed' ] +json.geocoding.errors.should.eql [ '\'notlayer\' is an invalid layers parameter. Valid options: coarse,address,venue,country,region,county,locality,continent,dependency,macroregion,localadmin,macrocounty,macrohood,borough,neighbourhood,microhood,disputed' ] #? expected warnings should.not.exist json.geocoding.warnings diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fbe80052..b4cc33d2 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,24 +7,31 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', + 'boost': 1, + 'slop': 3, 'query': 'one', - 'type': 'phrase', - 'operator': 'and' + 'type': 'phrase' } } }], 'should':[{ + 'match': { + 'phrase.default': { + 'analyzer': 'peliasPhrase', + 'boost': 1, + 'slop': 3, + 'query': 'one', + 'type': 'phrase' + } + } + },{ 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +52,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index 4f722b84..430d43c9 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index d9c04fd1..9a4afc05 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab..d0465b04 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -6,11 +6,11 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } @@ -18,7 +18,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'three', 'type': 'phrase', @@ -26,16 +26,25 @@ module.exports = { } } }], - 'should':[{ + 'should':[ + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, + { 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -56,11 +65,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index 24b89ad9..4360f4d7 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f6451..7cb51eea 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -7,11 +7,11 @@ module.exports = { 'must': [ { 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } @@ -54,6 +54,15 @@ module.exports = { } } }, + { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 600, + 'query': 'three' + } + } + }, { 'match': { 'parent.localadmin': { @@ -81,16 +90,24 @@ module.exports = { } } }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, { 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -114,11 +131,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js new file mode 100644 index 00000000..f89e8493 --- /dev/null +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -0,0 +1,155 @@ + +module.exports = { + 'query': { + 'filtered': { + 'query': { + 'bool': { + 'must': [{ + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'type': 'phrase', + 'boost': 1, + 'slop': 3, + 'query': 'k road' + } + } + }], + 'should':[ + { + 'match': { + 'address_parts.street': { + 'query': 'k road', + 'boost': 5, + 'analyzer': 'peliasStreet' + } + } + }, { + 'match': { + 'parent.country': { + 'query': 'laird', + 'boost': 800, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region_a': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.county': { + 'query': 'laird', + 'boost': 400, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 600, + 'query': 'laird' + } + } + }, { + 'match': { + 'parent.localadmin': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.locality': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.neighbourhood': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'k road' + } + } + }, + { + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'popularity', + 'missing': 1 + }, + 'weight': 1 + }] + } + },{ + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'population', + 'missing': 1 + }, + 'weight': 3 + }] + } + }] + } + } + } + }, + 'sort': [ '_score' ], + 'size': 20, + 'track_scores': true +}; diff --git a/test/unit/fixture/autocomplete_with_source_filtering.js b/test/unit/fixture/autocomplete_with_source_filtering.js index 22c12a5d..075eb6d4 100644 --- a/test/unit/fixture/autocomplete_with_source_filtering.js +++ b/test/unit/fixture/autocomplete_with_source_filtering.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index 4bf45315..71965df4 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 8a8290ab..dfd64e34 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } @@ -139,6 +139,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index 5bb5907c..6afe7be6 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index 5d03d66d..da3e8fb3 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 96fe92f6..d5042c0f 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index 0924475d..b99febea 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index 58c05826..a564a4c1 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js b/test/unit/fixture/search_linguistic_viewport.js index be76ab05..b85d8322 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index cf44d0d8..e6b50ac6 100644 --- a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6c4174b6..746899b7 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } @@ -107,6 +107,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index e0c05f3c..0a8b199d 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } @@ -123,6 +123,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'manhattan', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_with_source_filtering.js b/test/unit/fixture/search_with_source_filtering.js index 593eac5b..18ee13a3 100644 --- a/test/unit/fixture/search_with_source_filtering.js +++ b/test/unit/fixture/search_with_source_filtering.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/helper/labelGenerator_examples.js b/test/unit/helper/labelGenerator_examples.js index 416b7598..3207df86 100644 --- a/test/unit/helper/labelGenerator_examples.js +++ b/test/unit/helper/labelGenerator_examples.js @@ -104,6 +104,17 @@ module.exports.tests.france = function(test, common) { }; +module.exports.tests.name_only = function(test, common) { + test('name-only results (no admin fields) should not include extraneous comma', function(t) { + var doc = { + 'name': 'Result name', + }; + t.equal(generator(doc),'Result name'); + t.end(); + }); + +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { diff --git a/test/unit/helper/text_parser.js b/test/unit/helper/text_parser.js deleted file mode 100644 index ca5b05f0..00000000 --- a/test/unit/helper/text_parser.js +++ /dev/null @@ -1,150 +0,0 @@ -var parser = require('../../../helper/text_parser'); - -var type_mapping = require('../../../helper/type_mapping'); -var layers_map = type_mapping.layer_mapping; - -module.exports.tests = {}; - -module.exports.tests.interface = function(test, common) { - test('interface', function(t) { - t.equal(typeof parser.get_parsed_address, 'function', 'valid function'); - t.equal(typeof parser.get_layers, 'function', 'valid function'); - t.end(); - }); -}; - -module.exports.tests.split_on_comma = function(test, common) { - var queries = [ - { name: 'soho', admin_parts: 'new york' }, - { name: 'chelsea', admin_parts: 'london' }, - { name: '123 main', admin_parts: 'new york' } - ]; - - queries.forEach(function (query) { - test('naive parsing ' + query, function(t) { - var address = parser.get_parsed_address(query.name + ', ' + query.admin_parts); - - t.equal(typeof address, 'object', 'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - - test('naive parsing ' + query + 'without spaces', function(t) { - var address = parser.get_parsed_address(query.name + ',' + query.admin_parts); - - t.equal(typeof address, 'object', 'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - }); -}; - -module.exports.tests.parse_three_chars_or_less = function(test, common) { - var chars_queries = ['a', 'bb', 'ccc']; - var num_queries = ['1', '12', '123']; - var alphanum_q = ['a1', '1a2', '12c']; - - var queries = chars_queries.concat(num_queries).concat(alphanum_q); - queries.forEach(function(query) { - test('query length < 3 (' + query + ')', function(t) { - var address = parser.get_parsed_address(query); - var target_layer = layers_map.coarse; - var layers = parser.get_layers(query); - - t.equal(typeof address, 'object', 'valid object'); - t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', ')); - t.end(); - }); - }); -}; - -module.exports.tests.parse_one_token = function(test, common) { - test('query with one token', function (t) { - var address = parser.get_parsed_address('yugolsavia'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, no numbers', function (t) { - var address = parser.get_parsed_address('small town'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number first', function (t) { - var address = parser.get_parsed_address('123 main'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number second', function (t) { - var address = parser.get_parsed_address('main 123'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with many tokens', function(t) { - var address = parser.get_parsed_address('main particle new york'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); -}; - -module.exports.tests.parse_address = function(test, common) { - test('valid address, house number', function(t) { - var query_string = '123 main st new york ny'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.end(); - }); - test('valid address, zipcode', function(t) { - var query_string = '123 main st new york ny 10010'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.equal(address.postalcode, '10010', 'parsed zip is a string'); - t.end(); - }); - test('valid address with leading 0s in zipcode', function(t) { - var query_string = '339 W Main St, Cheshire, 06410'; - var address = parser.get_parsed_address(query_string); - - console.log(address); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Cheshire'], 'parsed city'); - t.equal(address.postalcode, '06410', 'parsed zip'); - t.end(); - }); - test('valid address without spaces after commas', function(t) { - var query_string = '339 W Main St,Lancaster,PA'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '339', 'parsed house number'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Lancaster'], 'parsed city'); - t.deepEqual(address.state, 'PA', 'parsed state'); - t.end(); - }); -}; - - -module.exports.all = function (tape, common) { - - function test(name, testFunction) { - return tape('QUERY PARSING: ' + name, testFunction); - } - - for( var testCase in module.exports.tests ){ - module.exports.tests[testCase](test, common); - } -}; diff --git a/test/unit/helper/type_mapping.js b/test/unit/helper/type_mapping.js index 355fd4e6..a9ec4721 100644 --- a/test/unit/helper/type_mapping.js +++ b/test/unit/helper/type_mapping.js @@ -14,7 +14,7 @@ module.exports.tests.interfaces = function(test, common) { t.deepEquals(type_mapping.layer_mapping.coarse, [ 'continent', 'country', 'dependency', 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', - 'neighbourhood', 'microhood', 'disputed' ]); + 'borough', 'neighbourhood', 'microhood', 'disputed' ]); t.end(); }); diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 0e09457b..bb368fc9 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -1,6 +1,5 @@ var generate = require('../../../query/autocomplete'); -var parser = require('../../../helper/text_parser'); module.exports.tests = {}; @@ -14,25 +13,31 @@ module.exports.tests.interface = function(test, common) { module.exports.tests.query = function(test, common) { test('valid lingustic-only autocomplete', function(t) { var query = generate({ - text: 'test' + text: 'test', + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_only'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_only'); t.end(); }); test('valid lingustic autocomplete with 3 tokens', function(t) { var query = generate({ - text: 'one two three' + text: 'one two three', + tokens: ['one','two','three'], + tokens_complete: ['one','two'], + tokens_incomplete: ['three'] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_multiple_tokens.js'); + var expected = require('../fixture/autocomplete_linguistic_multiple_tokens'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_multiple_tokens'); t.end(); }); @@ -43,13 +48,16 @@ module.exports.tests.query = function(test, common) { name: 'one two', regions: [ 'one two', 'three' ], admin_parts: 'three' - } + }, + tokens: ['one','two'], + tokens_complete: ['one','two'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_with_admin.js'); + var expected = require('../fixture/autocomplete_linguistic_with_admin'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_with_admin'); t.end(); }); @@ -58,13 +66,16 @@ module.exports.tests.query = function(test, common) { // note: if 1 grams are enabled at a later date, remove this behaviour. test('valid lingustic autocomplete final token', function(t) { var query = generate({ - text: 'one t' + text: 'one t', + tokens: ['one','t'], + tokens_complete: ['one'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_final_token.js'); + var expected = require('../fixture/autocomplete_linguistic_final_token'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_final_token'); t.end(); }); @@ -72,13 +83,16 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 29.49136, - 'focus.point.lon': -82.50622 + 'focus.point.lon': -82.50622, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus'); t.end(); }); @@ -86,20 +100,26 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 0, - 'focus.point.lon': 0 + 'focus.point.lon': 0, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus_null_island'); t.end(); }); test('valid sources filter', function(t) { var query = generate({ 'text': 'test', - 'sources': ['test_source'] + 'sources': ['test_source'], + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -108,6 +128,26 @@ module.exports.tests.query = function(test, common) { t.deepEqual(compiled, expected, 'valid autocomplete query with source filtering'); t.end(); }); + + test('single character street address', function(t) { + var query = generate({ + text: 'k road, laird', + parsed_text: { + name: 'k road', + street: 'k road', + regions: [ 'laird' ] + }, + tokens: ['k', 'road'], + tokens_complete: ['k', 'road'], + tokens_incomplete: [] + }); + + var compiled = JSON.parse( JSON.stringify( query ) ); + var expected = require('../fixture/autocomplete_single_character_street'); + + t.deepEqual(compiled, expected, 'autocomplete_single_character_street'); + t.end(); + }); }; module.exports.all = function (tape, common) { diff --git a/test/unit/query/search.js b/test/unit/query/search.js index e503311b..a2bb8e2f 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -1,5 +1,5 @@ var generate = require('../../../query/search'); -var parser = require('../../../helper/text_parser'); +var text_analyzer = require('pelias-text-analyzer'); module.exports.tests = {}; @@ -25,7 +25,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_bbox'); t.end(); }); @@ -42,7 +42,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_bbox'); t.end(); }); @@ -55,7 +55,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_only'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_only'); t.end(); }); @@ -69,7 +69,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus'); t.end(); }); @@ -86,7 +86,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_viewport'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_viewport'); t.end(); }); @@ -119,7 +119,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_null_island'); t.end(); }); @@ -128,13 +128,13 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(address), + parsed_text: text_analyzer.parse(address), }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_full_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_full_address'); t.end(); }); @@ -143,13 +143,13 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_partial_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_partial_address'); t.end(); }); @@ -158,13 +158,13 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_regions_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_regions_address'); t.end(); }); diff --git a/test/unit/run.js b/test/unit/run.js index 6f64f2e6..1a6f7a90 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -19,7 +19,6 @@ var tests = [ require('./helper/labelGenerator_GBR'), require('./helper/labelGenerator_USA'), require('./helper/labelSchema'), - require('./helper/text_parser'), require('./helper/type_mapping'), require('./helper/sizeCalculator'), require('./middleware/confidenceScore'), @@ -47,6 +46,7 @@ var tests = [ require('./sanitiser/_sources'), require('./sanitiser/_sources_and_layers'), require('./sanitiser/_text'), + require('./sanitiser/_tokenizer'), require('./sanitiser/_deprecate_quattroshapes'), require('./src/backend'), require('./sanitiser/autocomplete'), diff --git a/test/unit/sanitiser/_layers.js b/test/unit/sanitiser/_layers.js index b9dcbd0f..a1fde0f4 100644 --- a/test/unit/sanitiser/_layers.js +++ b/test/unit/sanitiser/_layers.js @@ -42,8 +42,8 @@ module.exports.tests.sanitize_layers = function(test, common) { sanitize(raw, clean); var admin_layers = [ 'continent', 'country', 'dependency', - 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', - 'microhood', 'disputed' ]; + 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', + 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; t.deepEqual(clean.layers, admin_layers, 'coarse layers set'); t.end(); @@ -77,8 +77,8 @@ module.exports.tests.sanitize_layers = function(test, common) { sanitize(raw, clean); var expected_layers = [ 'continent', 'country', 'dependency', - 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', 'macrohood', 'neighbourhood', - 'microhood', 'disputed' ]; + 'macroregion', 'region', 'locality', 'localadmin', 'macrocounty', 'county', + 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; t.deepEqual(clean.layers, expected_layers, 'coarse + regular layers set'); t.end(); @@ -114,7 +114,7 @@ module.exports.tests.sanitize_layers = function(test, common) { var coarse_layers = [ 'continent', 'country', 'dependency', 'macroregion', 'region', 'locality', 'localadmin', - 'macrocounty', 'county', 'macrohood', 'neighbourhood', 'microhood', + 'macrocounty', 'county', 'macrohood', 'borough', 'neighbourhood', 'microhood', 'disputed' ]; var venue_layers = [ 'venue' ]; diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js new file mode 100644 index 00000000..8837d4ab --- /dev/null +++ b/test/unit/sanitiser/_tokenizer.js @@ -0,0 +1,457 @@ +var sanitiser = require('../../../sanitiser/_tokenizer'); + +module.exports.tests = {}; + +module.exports.tests.sanity_checks = function(test, common) { + test('clean.text not set', function(t) { + + var clean = {}; // clean.text not set + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.text not a string', function(t) { + + var clean = { text: {} }; // clean.text not a string + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('empty string', function(t) { + + var clean = { text: '' }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { + + var clean = { parsed_text: { text: {} } }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.text', function(t) { + + var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text street data over clean.text', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over all other variables + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.space_delimiter = function(test, common) { + test('space delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('space delimiter - multiple spaces / other whitespace', function(t) { + + var clean = { text: ' 30 west \t26th \nstreet new york ' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.comma_delimiter = function(test, common) { + test('comma delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street, new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('comma delimiter - multiple commas', function(t) { + + var clean = { text: ',30 west 26th street,,, new york,' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.forward_slash_delimiter = function(test, common) { + test('forward slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street/133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('forward slash - multiple slashes', function(t) { + + var clean = { text: '/Bedell Street//133rd Avenue/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.final_token_single_gram = function(test, common) { + test('final token single gram - numeric', function(t) { + + var clean = { text: 'grolmanstrasse 1' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + '1' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + '1' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('final token single gram - non-numeric', function(t) { + + var clean = { text: 'grolmanstrasse a' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + 'a' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token removed! + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.back_slash_delimiter = function(test, common) { + test('back slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street\\133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('back slash - multiple slashes', function(t) { + + var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.mixed_delimiter = function(test, common) { + test('mixed delimiters', function(t) { + + var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('SANITISER _tokenizer: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitiser/autocomplete.js b/test/unit/sanitiser/autocomplete.js index 26bf9afb..186cb4b6 100644 --- a/test/unit/sanitiser/autocomplete.js +++ b/test/unit/sanitiser/autocomplete.js @@ -4,7 +4,10 @@ module.exports.tests = {}; module.exports.tests.sanitisers = function(test, common) { test('check sanitiser list', function (t) { - var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; + var expected = [ + 'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources', + 'sources_and_layers', 'private', 'geo_autocomplete' + ]; t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.end(); }); diff --git a/test/unit/sanitiser/search.js b/test/unit/sanitiser/search.js index e09672ce..864195a8 100644 --- a/test/unit/sanitiser/search.js +++ b/test/unit/sanitiser/search.js @@ -1,6 +1,6 @@ var extend = require('extend'), search = require('../../../sanitiser/search'), - parser = require('../../../helper/text_parser'), + text_analyzer = require('pelias-text-analyzer'), sanitize = search.sanitize, middleware = search.middleware, defaultError = 'invalid param \'text\': text length, must be >0'; @@ -80,7 +80,7 @@ module.exports.tests.sanitize_text_with_delim = function(test, common) { sanitize( req, function( ){ var expected_text = text; - var expected_parsed_text = parser.get_parsed_address(text); + var expected_parsed_text = text_analyzer.parse(text); t.equal(req.errors[0], undefined, 'no error'); t.equal(req.clean.parsed_text.name, expected_parsed_text.name, 'clean name set correctly'); t.equal(req.clean.text, expected_text, 'text should match');