diff --git a/query/autocomplete.js b/query/autocomplete.js index 6da4d569..50f6da29 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -63,31 +63,24 @@ function generateQuery( clean ){ vs.var( 'sources', clean.sources ); } - // mark the name as incomplete (user has not yet typed a comma) - vs.var( 'input:name:isComplete', false ); - - // perform some operations on 'clean.text': - // 1. if there is a space followed by a single char, remove them. - // - this is required as the index uses 2grams and sending 1grams - // - to a 2gram index when using 'type:phrase' or 'operator:and' will - // - result in a complete failure of the query. - // 2. trim leading and trailing whitespace. - // note: single digit grams are now being produced in the name.* index - var text = clean.text.replace(/( [^0-9]$)/g,'').trim(); + // pass the input tokens to the views so they can choose which tokens + // are relevant for their specific function. + if( check.array( clean.tokens ) ){ + vs.var( 'input:name:tokens', clean.tokens ); + vs.var( 'input:name:tokens_complete', clean.tokens_complete ); + vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete ); + } + + // input text + vs.var( 'input:name', clean.text ); // if the input parser has run and suggested a 'parsed_text.name' to use. if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - // mark the name as complete (user has already typed a comma) - vs.var( 'input:name:isComplete', true ); - // use 'parsed_text.name' instead of 'clean.text'. - text = clean.parsed_text.name; + vs.var( 'input:name', clean.parsed_text.name ); } - // input text - vs.var( 'input:name', text ); - // focus point if( check.number(clean['focus.point.lat']) && check.number(clean['focus.point.lon']) ){ diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js index 8cf575f4..9af56cfb 100644 --- a/query/view/boost_exact_matches.js +++ b/query/view/boost_exact_matches.js @@ -27,19 +27,11 @@ module.exports = function( vs ){ vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); - // split the 'input:name' on whitespace - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // if the query is incomplete then we need to remove - // the final (incomplete) token as it will not match - // tokens in the phrase.* index. - if( !vs.var('input:name:isComplete').get() ){ - tokens.pop(); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); // no valid tokens to use, fail now, don't render this view. - if( tokens.length < 1 ){ return null; } + if( !tokens || tokens.length < 1 ){ return null; } // set 'input:name' to be only the fully completed characters vsCopy.var('input:name').set( tokens.join(' ') ); diff --git a/query/view/ngrams_last_token_only.js b/query/view/ngrams_last_token_only.js index 3e3315f7..2665c294 100644 --- a/query/view/ngrams_last_token_only.js +++ b/query/view/ngrams_last_token_only.js @@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'), eg. if the input was "100 foo str", then 'input:name' would only be 'str' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. 
- code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'), module.exports = function( vs ){ - // Totally disable this view when bool value 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ return null; } + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable - var name = vs.var('input:name').get(); - // set the 'name' variable in the copy to only the last token - vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return ngrams_strict( vsCopy ); diff --git a/query/view/phrase_first_tokens_only.js b/query/view/phrase_first_tokens_only.js index b047b30f..7ab4539b 100644 --- a/query/view/phrase_first_tokens_only.js +++ b/query/view/phrase_first_tokens_only.js @@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query'); eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query'); module.exports = function( vs ){ - // Don't mutate the name variable when 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ - // return the view rendered using the original vars - return peliasQuery.view.phrase( vs ); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable and split in to tokens - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // single token only, abort (we don't want the *last* token) - // return null here will completely disable the view. 
- if( tokens.length < 2 ){ return null; }
- // set the 'name' variable in the copy to all but the last token
- vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) );
+ vsCopy.var('input:name').set( tokens.join(' ') );
 // return the view rendered using the copy
 return peliasQuery.view.phrase( vsCopy );
diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js
index d18b9963..f29191fc 100644
--- a/query/view/pop_subquery.js
+++ b/query/view/pop_subquery.js
@@ -1,5 +1,6 @@
-var peliasQuery = require('pelias-query');
+var peliasQuery = require('pelias-query'),
+ check = require('check-types');
 /** Population / Popularity subquery
@@ -12,5 +13,21 @@ module.exports = function( vs ){
 view.match['name.default'].analyzer = vs.var('phrase:analyzer');
 delete view.match['name.default'].boost;
+ // only use complete tokens against the phrase index (where possible).
+ var completeTokens = vs.var('input:name:tokens_complete').get(),
+ incompleteTokens = vs.var('input:name:tokens_incomplete').get();
+
+ // if the tokenizer has run (autocomplete only) then we will combine the
+ // 'complete' tokens with the 'incomplete' tokens; the resulting array differs
+ // slightly from the 'input:name:tokens' array as some tokens might have been
+ // removed in the process, such as single grams which are not present in the
+ // ngrams index.
+ if( check.array( completeTokens ) && check.array( incompleteTokens ) ){
+ var combined = completeTokens.concat( incompleteTokens );
+ if( combined.length ){
+ view.match['name.default'].query = combined.join(' ');
+ }
+ }
+
 return view;
};
diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js
new file mode 100644
index 00000000..7b8e234c
--- /dev/null
+++ b/sanitiser/_tokenizer.js
@@ -0,0 +1,95 @@
+
+var check = require('check-types');
+
+/**
+ simplified version of the elasticsearch tokenizer, used in order to
+ be able to detect which tokens are 'complete' (user has finished typing them)
+ or 'incomplete' (the user has possibly only typed part of the token).
+
+ note: we don't need to strip punctuation as that will be handled on the
+ elasticsearch side, so sending a token such as 'st.' is not an issue, these
+ tokens should *not* be modified as the analysis can use the punctuation to
+ infer meaning.
+
+ note: this sanitizer should run *after* the '_text' sanitizer so it can
+ use the output of clean.parsed_text where available.
+**/
+function sanitize( raw, clean ){
+
+ // error & warning messages
+ var messages = { errors: [], warnings: [] };
+
+ // this is the string we will use for analysis
+ var text = clean.text;
+
+ // a boolean to track whether the input parser ran successfully or not.
+ var inputParserRanSuccessfully = false;
+
+ // if the text parser has run then we only tokenize the 'name' section
+ // of the 'parsed_text' object, ignoring the 'admin' parts.
+ if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){
+ inputParserRanSuccessfully = true;
+ text = clean.parsed_text.name; // use this string instead
+ }
+
+ // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors.
+ clean.tokens = [];
+ clean.tokens_complete = [];
+ clean.tokens_incomplete = [];
+
+ // sanity check that the text is valid.
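+ // note: if it is not, the arrays above are simply left empty and no error is raised.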
+ if( check.nonEmptyString( text ) ){
+
+ // split according to the regex used in the elasticsearch tokenizer
+ // see: https://github.com/pelias/schema/blob/master/settings.js
+ // see: settings.analysis.tokenizer.peliasNameTokenizer
+ clean.tokens = text
+ .split(/[\s,\\\/]+/) // split on delimiters
+ .filter(function(el){return el;}); // remove empty elements
+ }
+
+ /**
+ the following section splits the tokens into two arrays called
+ 'tokens_complete' and 'tokens_incomplete'.
+
+ it also strips any tokens from 'tokens_incomplete' which might not
+ match the ngrams index (such as single grams not stored in the index).
+ **/
+
+ // split the tokens into 'complete' and 'incomplete'.
+ if( clean.tokens.length ){
+
+ // if all the tokens are complete, simply copy them from clean.tokens
+ if( inputParserRanSuccessfully ){
+
+ // all these tokens are complete!
+ clean.tokens_complete = clean.tokens.slice();
+
+ // user hasn't finished typing yet
+ } else {
+
+ // make a copy of the tokens and remove the last element
+ var tokensCopy = clean.tokens.slice(),
+ lastToken = tokensCopy.pop();
+
+ // set all but the last token as 'complete'
+ clean.tokens_complete = tokensCopy;
+
+ /**
+ if the last token is a single non-numeric character then we must discard it.
+
+ at time of writing, single non-numeric ngrams are not stored in the index,
+ sending them as part of the query would result in 0 documents being returned.
+ **/
+ if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){
+ clean.tokens_incomplete = [ lastToken ];
+ }
+ }
+
+ }
+
+ return messages;
+}
+
+// export function
+module.exports = sanitize;
diff --git a/sanitiser/autocomplete.js b/sanitiser/autocomplete.js
index f9698956..8ab6fd9c 100644
--- a/sanitiser/autocomplete.js
+++ b/sanitiser/autocomplete.js
@@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'),
 sanitizers = {
 singleScalarParameters: require('../sanitiser/_single_scalar_parameters'),
 text: require('../sanitiser/_text'),
+ tokenizer: require('../sanitiser/_tokenizer'),
 size: require('../sanitiser/_size')(10, 10, 10),
 layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping),
 sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping),
diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js
index fc431c77..b4cc33d2 100644
--- a/test/unit/fixture/autocomplete_linguistic_final_token.js
+++ b/test/unit/fixture/autocomplete_linguistic_final_token.js
@@ -7,15 +7,25 @@ module.exports = {
 'must': [{
 'match': {
 'name.default': {
- 'analyzer': 'peliasQueryPartialToken',
- 'boost': 100,
+ 'analyzer': 'peliasQueryFullToken',
+ 'boost': 1,
+ 'slop': 3,
 'query': 'one',
- 'type': 'phrase',
- 'operator': 'and'
+ 'type': 'phrase'
 }
 }
 }],
 'should':[{
+ 'match': {
+ 'phrase.default': {
+ 'analyzer': 'peliasPhrase',
+ 'boost': 1,
+ 'slop': 3,
+ 'query': 'one',
+ 'type': 'phrase'
+ }
+ }
+ },{
+ 'function_score': {
+ 'query': {
+ 'match': {
diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js
index 72cfb5f2..bb368fc9 100644
--- a/test/unit/query/autocomplete.js
+++ b/test/unit/query/autocomplete.js
@@ -13,7 +13,10 @@ module.exports.tests.interface = function(test, common) {
 module.exports.tests.query = function(test, common) {
 test('valid lingustic-only autocomplete', function(t) {
 var query = generate({
- text: 'test'
+ text: 'test',
+ tokens: ['test'],
+ tokens_complete: [],
+ tokens_incomplete: ['test']
 });
 var
compiled = JSON.parse( JSON.stringify( query ) ); @@ -25,7 +28,10 @@ module.exports.tests.query = function(test, common) { test('valid lingustic autocomplete with 3 tokens', function(t) { var query = generate({ - text: 'one two three' + text: 'one two three', + tokens: ['one','two','three'], + tokens_complete: ['one','two'], + tokens_incomplete: ['three'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -42,7 +48,10 @@ module.exports.tests.query = function(test, common) { name: 'one two', regions: [ 'one two', 'three' ], admin_parts: 'three' - } + }, + tokens: ['one','two'], + tokens_complete: ['one','two'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -57,7 +66,10 @@ module.exports.tests.query = function(test, common) { // note: if 1 grams are enabled at a later date, remove this behaviour. test('valid lingustic autocomplete final token', function(t) { var query = generate({ - text: 'one t' + text: 'one t', + tokens: ['one','t'], + tokens_complete: ['one'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -71,7 +83,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 29.49136, - 'focus.point.lon': -82.50622 + 'focus.point.lon': -82.50622, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -85,7 +100,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 0, - 'focus.point.lon': 0 + 'focus.point.lon': 0, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -98,7 +116,10 @@ module.exports.tests.query = function(test, common) { test('valid sources filter', function(t) { var query = generate({ 'text': 'test', - 'sources': ['test_source'] + 'sources': ['test_source'], + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -115,7 +136,10 @@ module.exports.tests.query = function(test, common) { name: 'k road', street: 'k road', regions: [ 'laird' ] - } + }, + tokens: ['k', 'road'], + tokens_complete: ['k', 'road'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); diff --git a/test/unit/run.js b/test/unit/run.js index 94d9ebb3..1a6f7a90 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -46,6 +46,7 @@ var tests = [ require('./sanitiser/_sources'), require('./sanitiser/_sources_and_layers'), require('./sanitiser/_text'), + require('./sanitiser/_tokenizer'), require('./sanitiser/_deprecate_quattroshapes'), require('./src/backend'), require('./sanitiser/autocomplete'), diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js new file mode 100644 index 00000000..a7c6ced4 --- /dev/null +++ b/test/unit/sanitiser/_tokenizer.js @@ -0,0 +1,425 @@ +var sanitiser = require('../../../sanitiser/_tokenizer'); + +module.exports.tests = {}; + +module.exports.tests.sanity_checks = function(test, common) { + test('clean.text not set', function(t) { + + var clean = {}; // clean.text not set + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + 
t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.text not a string', function(t) { + + var clean = { text: {} }; // clean.text not a string + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('empty string', function(t) { + + var clean = { text: '' }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { + + var clean = { parsed_text: { text: {} } }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.text', function(t) { + + var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.space_delimiter = function(test, common) { + test('space delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('space delimiter - multiple spaces / other whitespace', function(t) { + + var clean = { text: ' 30 west \t26th \nstreet new york ' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + 
t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.comma_delimiter = function(test, common) { + test('comma delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street, new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('comma delimiter - multiple commas', function(t) { + + var clean = { text: ',30 west 26th street,,, new york,' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.forward_slash_delimiter = function(test, common) { + test('forward slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street/133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('forward slash - multiple slashes', function(t) { + + var clean = { text: '/Bedell Street//133rd Avenue/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.final_token_single_gram = function(test, common) { + test('final token single gram - numeric', function(t) { + + var clean = { text: 'grolmanstrasse 1' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + '1' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 
'grolmanstrasse', + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + '1' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('final token single gram - non-numeric', function(t) { + + var clean = { text: 'grolmanstrasse a' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + 'a' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token removed! + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.back_slash_delimiter = function(test, common) { + test('back slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street\\133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('back slash - multiple slashes', function(t) { + + var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.mixed_delimiter = function(test, common) { + test('mixed delimiters', function(t) { + + var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('SANITISER _tokenizer: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitiser/autocomplete.js b/test/unit/sanitiser/autocomplete.js index 26bf9afb..186cb4b6 100644 --- a/test/unit/sanitiser/autocomplete.js +++ b/test/unit/sanitiser/autocomplete.js @@ -4,7 +4,10 @@ module.exports.tests = {}; module.exports.tests.sanitisers = function(test, common) { test('check sanitiser list', function (t) { - var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; + var expected = [ + 'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources', + 'sources_and_layers', 'private', 'geo_autocomplete' + ]; t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.end(); });
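
Below is a minimal usage sketch of the new '_tokenizer' sanitiser (an illustration only, not part of the patch itself); the input string mirrors the unit tests above and the require path assumes the repository root:

 var tokenize = require('./sanitiser/_tokenizer');

 // a query where the user is still typing the final token
 var clean = { text: '30 west 26th street new' };
 var messages = tokenize( {}, clean );

 // clean.tokens            => [ '30', 'west', '26th', 'street', 'new' ]
 // clean.tokens_complete   => [ '30', 'west', '26th', 'street' ]
 // clean.tokens_incomplete => [ 'new' ]
 // messages                => { errors: [], warnings: [] }

The query views then pick the token list relevant to them: 'phrase_first_tokens_only' and 'boost_exact_matches' consume only 'input:name:tokens_complete', while 'ngrams_last_token_only' consumes only 'input:name:tokens_incomplete'.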