Browse Source

Merge pull request #529 from pelias/missinglink_complete_incomplete_refactor

improve api-side tokenizer
pull/526/head
Peter Johnson a.k.a. insertcoffee 9 years ago
parent
commit
e6d9a0c034
  1. 29
      query/autocomplete.js
  2. 14
      query/view/boost_exact_matches.js
  3. 17
      query/view/ngrams_last_token_only.js
  4. 25
      query/view/phrase_first_tokens_only.js
  5. 19
      query/view/pop_subquery.js
  6. 95
      sanitiser/_tokenizer.js
  7. 1
      sanitiser/autocomplete.js
  8. 18
      test/unit/fixture/autocomplete_linguistic_final_token.js
  9. 40
      test/unit/query/autocomplete.js
  10. 1
      test/unit/run.js
  11. 425
      test/unit/sanitiser/_tokenizer.js
  12. 5
      test/unit/sanitiser/autocomplete.js

29
query/autocomplete.js

@ -63,31 +63,24 @@ function generateQuery( clean ){
vs.var( 'sources', clean.sources ); vs.var( 'sources', clean.sources );
} }
// mark the name as incomplete (user has not yet typed a comma) // pass the input tokens to the views so they can choose which tokens
vs.var( 'input:name:isComplete', false ); // are relevant for their specific function.
if( check.array( clean.tokens ) ){
// perform some operations on 'clean.text': vs.var( 'input:name:tokens', clean.tokens );
// 1. if there is a space followed by a single char, remove them. vs.var( 'input:name:tokens_complete', clean.tokens_complete );
// - this is required as the index uses 2grams and sending 1grams vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete );
// - to a 2gram index when using 'type:phrase' or 'operator:and' will }
// - result in a complete failure of the query.
// 2. trim leading and trailing whitespace. // input text
// note: single digit grams are now being produced in the name.* index vs.var( 'input:name', clean.text );
var text = clean.text.replace(/( [^0-9]$)/g,'').trim();
// if the input parser has run and suggested a 'parsed_text.name' to use. // if the input parser has run and suggested a 'parsed_text.name' to use.
if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){
// mark the name as complete (user has already typed a comma)
vs.var( 'input:name:isComplete', true );
// use 'parsed_text.name' instead of 'clean.text'. // use 'parsed_text.name' instead of 'clean.text'.
text = clean.parsed_text.name; vs.var( 'input:name', clean.parsed_text.name );
} }
// input text
vs.var( 'input:name', text );
// focus point // focus point
if( check.number(clean['focus.point.lat']) && if( check.number(clean['focus.point.lat']) &&
check.number(clean['focus.point.lon']) ){ check.number(clean['focus.point.lon']) ){

14
query/view/boost_exact_matches.js

@ -27,19 +27,11 @@ module.exports = function( vs ){
vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']);
vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); vsCopy.var('phrase:field').set(searchDefaults['phrase:field']);
// split the 'input:name' on whitespace // get a copy of the *complete* tokens produced from the input:name
var name = vs.var('input:name').get(), var tokens = vs.var('input:name:tokens_complete').get();
tokens = name.split(' ');
// if the query is incomplete then we need to remove
// the final (incomplete) token as it will not match
// tokens in the phrase.* index.
if( !vs.var('input:name:isComplete').get() ){
tokens.pop();
}
// no valid tokens to use, fail now, don't render this view. // no valid tokens to use, fail now, don't render this view.
if( tokens.length < 1 ){ return null; } if( !tokens || tokens.length < 1 ){ return null; }
// set 'input:name' to be only the fully completed characters // set 'input:name' to be only the fully completed characters
vsCopy.var('input:name').set( tokens.join(' ') ); vsCopy.var('input:name').set( tokens.join(' ') );

17
query/view/ngrams_last_token_only.js

@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'),
eg. if the input was "100 foo str", then 'input:name' would only be 'str' eg. if the input was "100 foo str", then 'input:name' would only be 'str'
note: it is assumed that the rest of the input is matched using another view. note: it is assumed that the rest of the input is matched using another view.
there is an additional flag 'input:name:isComplete' used to disable this view
selectively, see that section for more info.
code notes: this view makes a copy of the $vs object in order to change their code notes: this view makes a copy of the $vs object in order to change their
values without mutating the original values, which may be expected in their values without mutating the original values, which may be expected in their
unaltered form by other views. unaltered form by other views.
@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'),
module.exports = function( vs ){ module.exports = function( vs ){
// Totally disable this view when bool value 'input:name:isComplete' is true. // get a copy of the *tokens_incomplete* tokens produced from the input:name
// This is the case when the user has typed a comma, so we can assume var tokens = vs.var('input:name:tokens_incomplete').get();
// that the 'name' part of the query is now complete.
if( vs.var('input:name:isComplete').get() ){ return null; } // no valid tokens to use, fail now, don't render this view.
if( !tokens || tokens.length < 1 ){ return null; }
// make a copy Vars so we don't mutate the original // make a copy Vars so we don't mutate the original
var vsCopy = new peliasQuery.Vars( vs.export() ); var vsCopy = new peliasQuery.Vars( vs.export() );
// get the input 'name' variable
var name = vs.var('input:name').get();
// set the 'name' variable in the copy to only the last token // set the 'name' variable in the copy to only the last token
vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); vsCopy.var('input:name').set( tokens.join(' ') );
// return the view rendered using the copy // return the view rendered using the copy
return ngrams_strict( vsCopy ); return ngrams_strict( vsCopy );

25
query/view/phrase_first_tokens_only.js

@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query');
eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' eg. if the input was "100 foo str", then 'input:name' would only be '100 foo'
note: it is assumed that the rest of the input is matched using another view. note: it is assumed that the rest of the input is matched using another view.
there is an additional flag 'input:name:isComplete' used to disable this view
selectively, see that section for more info.
code notes: this view makes a copy of the $vs object in order to change their code notes: this view makes a copy of the $vs object in order to change their
values without mutating the original values, which may be expected in their values without mutating the original values, which may be expected in their
unaltered form by other views. unaltered form by other views.
@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query');
module.exports = function( vs ){ module.exports = function( vs ){
// Don't mutate the name variable when 'input:name:isComplete' is true. // get a copy of the *complete* tokens produced from the input:name
// This is the case when the user has typed a comma, so we can assume var tokens = vs.var('input:name:tokens_complete').get();
// that the 'name' part of the query is now complete.
if( vs.var('input:name:isComplete').get() ){ // no valid tokens to use, fail now, don't render this view.
// return the view rendered using the original vars if( !tokens || tokens.length < 1 ){ return null; }
return peliasQuery.view.phrase( vs );
}
// make a copy Vars so we don't mutate the original // make a copy Vars so we don't mutate the original
var vsCopy = new peliasQuery.Vars( vs.export() ); var vsCopy = new peliasQuery.Vars( vs.export() );
// get the input 'name' variable and split in to tokens
var name = vs.var('input:name').get(),
tokens = name.split(' ');
// single token only, abort (we don't want the *last* token)
// return null here will completely disable the view.
if( tokens.length < 2 ){ return null; }
// set the 'name' variable in the copy to all but the last token // set the 'name' variable in the copy to all but the last token
vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) ); vsCopy.var('input:name').set( tokens.join(' ') );
// return the view rendered using the copy // return the view rendered using the copy
return peliasQuery.view.phrase( vsCopy ); return peliasQuery.view.phrase( vsCopy );

19
query/view/pop_subquery.js

@ -1,5 +1,6 @@
var peliasQuery = require('pelias-query'); var peliasQuery = require('pelias-query'),
check = require('check-types');
/** /**
Population / Popularity subquery Population / Popularity subquery
@ -12,5 +13,21 @@ module.exports = function( vs ){
view.match['name.default'].analyzer = vs.var('phrase:analyzer'); view.match['name.default'].analyzer = vs.var('phrase:analyzer');
delete view.match['name.default'].boost; delete view.match['name.default'].boost;
// only use complete tokens against the phrase index (where possible).
var completeTokens = vs.var('input:name:tokens_complete').get(),
incompleteTokens = vs.var('input:name:tokens_incomplete').get();
// if the tokenizer has run (autocomplete only) then we will combine the
// 'complete' tokens with the 'incomplete' tokens, the resulting array differs
// slightly from the 'input:name:tokens' array as some tokens might have been
// removed in the process; such as single grams which are not present in the
// ngrams index.
if( check.array( completeTokens ) && check.array( incompleteTokens ) ){
var combined = completeTokens.concat( incompleteTokens );
if( combined.length ){
view.match['name.default'].query = combined.join(' ');
}
}
return view; return view;
}; };

95
sanitiser/_tokenizer.js

@ -0,0 +1,95 @@
var check = require('check-types');
/**
  Simplified version of the elasticsearch tokenizer, used in order to
  be able to detect which tokens are 'complete' (user has finished typing them)
  or 'incomplete' (the user has possibly only typed part of the token).

  note: we don't need to strip punctuation as that will be handled on the
  elasticsearch side, so sending a token such as 'st.' is not an issue, these
  tokens should *not* be modified as the analysis can use the punctuation to
  infer meaning.

  note: this sanitizer should run *after* the '_text' sanitizer so it can
  use the output of clean.parsed_text where available.

  @param {object} raw   - the raw request parameters (not read by this sanitizer).
  @param {object} clean - the sanitized parameters; 'clean.text' and
                          'clean.parsed_text.name' are read, and the arrays
                          'clean.tokens', 'clean.tokens_complete' and
                          'clean.tokens_incomplete' are always (re)assigned.
  @returns {object} an object of the form { errors: [], warnings: [] };
                    this sanitizer never adds any messages to either array.
**/
function sanitize( raw, clean ){

  // error & warning messages
  var messages = { errors: [], warnings: [] };

  // borrowed from the prototype so the lookup cannot be shadowed by the target
  var hasOwn = Object.prototype.hasOwnProperty;

  // this is the string we will use for analysis
  var text = clean.text;

  // a boolean to track whether the input parser successfully ran; or not.
  var inputParserRanSuccessfully = false;

  // if the text parser has run then we only tokenize the 'name' section
  // of the 'parsed_text' object, ignoring the 'admin' parts.
  // a null/undefined 'parsed_text' is treated as "parser did not run" rather
  // than being dereferenced (which would throw a TypeError).
  var parsedText = hasOwn.call( clean, 'parsed_text' ) ? clean.parsed_text : undefined;
  if( parsedText !== null && parsedText !== undefined && hasOwn.call( parsedText, 'name' ) ){
    inputParserRanSuccessfully = true;
    text = parsedText.name; // use this string instead
  }

  // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors.
  clean.tokens = [];
  clean.tokens_complete = [];
  clean.tokens_incomplete = [];

  // sanity check that the text is valid.
  if( check.nonEmptyString( text ) ){

    // split according to the regex used in the elasticsearch tokenizer
    // see: https://github.com/pelias/schema/blob/master/settings.js
    // see: settings.analysis.tokenizer.peliasNameTokenizer
    clean.tokens = text
      .split(/[\s,\\\/]+/) // split on delimiters
      .filter(function(el){ return el; }); // remove empty elements
  }

  /**
    the following section splits the tokens in to two arrays called
    'tokens_complete' and 'tokens_incomplete'.

    it also strips any tokens from 'tokens_incomplete' which might not
    match the ngrams index (such as single grams not stored in the index).
  **/

  // nothing to split, the empty arrays assigned above are the result.
  if( !clean.tokens.length ){
    return messages;
  }

  // the parser isolated the 'name' section, so all these tokens are complete!
  if( inputParserRanSuccessfully ){
    clean.tokens_complete = clean.tokens.slice();
    return messages;
  }

  // user hasn't finished typing yet:
  // make a copy of the tokens and remove the last element
  var tokensCopy = clean.tokens.slice(),
      lastToken = tokensCopy.pop();

  // set all but the last token as 'complete'
  clean.tokens_complete = tokensCopy;

  /**
    if the last token is a single non-numeric character then we must discard it.

    at time of writing, single non-numeric ngrams are not stored in the index,
    sending them as part of the query would result in 0 documents being returned.
  **/
  if( lastToken && ( lastToken.length > 1 || /[0-9]/.test( lastToken ) ) ){
    clean.tokens_incomplete = [ lastToken ];
  }

  return messages;
}
// export function
module.exports = sanitize;

1
sanitiser/autocomplete.js

@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'),
sanitizers = { sanitizers = {
singleScalarParameters: require('../sanitiser/_single_scalar_parameters'), singleScalarParameters: require('../sanitiser/_single_scalar_parameters'),
text: require('../sanitiser/_text'), text: require('../sanitiser/_text'),
tokenizer: require('../sanitiser/_tokenizer'),
size: require('../sanitiser/_size')(10, 10, 10), size: require('../sanitiser/_size')(10, 10, 10),
layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping), layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping),
sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping), sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping),

18
test/unit/fixture/autocomplete_linguistic_final_token.js

@ -7,15 +7,25 @@ module.exports = {
'must': [{ 'must': [{
'match': { 'match': {
'name.default': { 'name.default': {
'analyzer': 'peliasQueryPartialToken', 'analyzer': 'peliasQueryFullToken',
'boost': 100, 'boost': 1,
'slop': 3,
'query': 'one', 'query': 'one',
'type': 'phrase', 'type': 'phrase'
'operator': 'and'
} }
} }
}], }],
'should':[{ 'should':[{
'match': {
'phrase.default': {
'analyzer': 'peliasPhrase',
'boost': 1,
'slop': 3,
'query': 'one',
'type': 'phrase'
}
}
},{
'function_score': { 'function_score': {
'query': { 'query': {
'match': { 'match': {

40
test/unit/query/autocomplete.js

@ -13,7 +13,10 @@ module.exports.tests.interface = function(test, common) {
module.exports.tests.query = function(test, common) { module.exports.tests.query = function(test, common) {
test('valid lingustic-only autocomplete', function(t) { test('valid lingustic-only autocomplete', function(t) {
var query = generate({ var query = generate({
text: 'test' text: 'test',
tokens: ['test'],
tokens_complete: [],
tokens_incomplete: ['test']
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -25,7 +28,10 @@ module.exports.tests.query = function(test, common) {
test('valid lingustic autocomplete with 3 tokens', function(t) { test('valid lingustic autocomplete with 3 tokens', function(t) {
var query = generate({ var query = generate({
text: 'one two three' text: 'one two three',
tokens: ['one','two','three'],
tokens_complete: ['one','two'],
tokens_incomplete: ['three']
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -42,7 +48,10 @@ module.exports.tests.query = function(test, common) {
name: 'one two', name: 'one two',
regions: [ 'one two', 'three' ], regions: [ 'one two', 'three' ],
admin_parts: 'three' admin_parts: 'three'
} },
tokens: ['one','two'],
tokens_complete: ['one','two'],
tokens_incomplete: []
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -57,7 +66,10 @@ module.exports.tests.query = function(test, common) {
// note: if 1 grams are enabled at a later date, remove this behaviour. // note: if 1 grams are enabled at a later date, remove this behaviour.
test('valid lingustic autocomplete final token', function(t) { test('valid lingustic autocomplete final token', function(t) {
var query = generate({ var query = generate({
text: 'one t' text: 'one t',
tokens: ['one','t'],
tokens_complete: ['one'],
tokens_incomplete: []
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -71,7 +83,10 @@ module.exports.tests.query = function(test, common) {
var query = generate({ var query = generate({
text: 'test', text: 'test',
'focus.point.lat': 29.49136, 'focus.point.lat': 29.49136,
'focus.point.lon': -82.50622 'focus.point.lon': -82.50622,
tokens: ['test'],
tokens_complete: [],
tokens_incomplete: ['test']
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -85,7 +100,10 @@ module.exports.tests.query = function(test, common) {
var query = generate({ var query = generate({
text: 'test', text: 'test',
'focus.point.lat': 0, 'focus.point.lat': 0,
'focus.point.lon': 0 'focus.point.lon': 0,
tokens: ['test'],
tokens_complete: [],
tokens_incomplete: ['test']
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -98,7 +116,10 @@ module.exports.tests.query = function(test, common) {
test('valid sources filter', function(t) { test('valid sources filter', function(t) {
var query = generate({ var query = generate({
'text': 'test', 'text': 'test',
'sources': ['test_source'] 'sources': ['test_source'],
tokens: ['test'],
tokens_complete: [],
tokens_incomplete: ['test']
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );
@ -115,7 +136,10 @@ module.exports.tests.query = function(test, common) {
name: 'k road', name: 'k road',
street: 'k road', street: 'k road',
regions: [ 'laird' ] regions: [ 'laird' ]
} },
tokens: ['k', 'road'],
tokens_complete: ['k', 'road'],
tokens_incomplete: []
}); });
var compiled = JSON.parse( JSON.stringify( query ) ); var compiled = JSON.parse( JSON.stringify( query ) );

1
test/unit/run.js

@ -46,6 +46,7 @@ var tests = [
require('./sanitiser/_sources'), require('./sanitiser/_sources'),
require('./sanitiser/_sources_and_layers'), require('./sanitiser/_sources_and_layers'),
require('./sanitiser/_text'), require('./sanitiser/_text'),
require('./sanitiser/_tokenizer'),
require('./sanitiser/_deprecate_quattroshapes'), require('./sanitiser/_deprecate_quattroshapes'),
require('./src/backend'), require('./src/backend'),
require('./sanitiser/autocomplete'), require('./sanitiser/autocomplete'),

425
test/unit/sanitiser/_tokenizer.js

@ -0,0 +1,425 @@
var sanitiser = require('../../../sanitiser/_tokenizer');
module.exports.tests = {};
// Sanity checks: missing or invalid input must leave all three token arrays
// empty and must not produce any errors or warnings.
module.exports.tests.sanity_checks = function(test, common) {

  test('clean.text not set', function(t) {

    var clean = {}; // clean.text not set
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  test('clean.text not a string', function(t) {

    var clean = { text: {} }; // clean.text not a string
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  test('empty string', function(t) {

    var clean = { text: '' };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) {

    // 'name' is present but is not a string, so the tokenizer selects it
    // (in preference to clean.text) and must then reject it as invalid.
    var clean = { parsed_text: { name: {} } };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  test('favor clean.parsed_text.name over clean.text', function(t) {

    var clean = { parsed_text: { name: 'foo' }, text: 'bar' };
    var messages = sanitiser({}, clean);

    // favor clean.parsed_text.name over clean.text; because the name came from
    // the parser, every token is considered complete.
    t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};
// Whitespace handling: single spaces as well as runs of mixed whitespace
// (spaces, tabs, newlines) must all act as token delimiters.
module.exports.tests.space_delimiter = function(test, common) {

  test('space delimiter - simple', function(t) {

    var input = { text: '30 west 26th street new york' };
    var result = sanitiser({}, input);

    // every token expected from the input, in order
    var expected = [ '30', 'west', '26th', 'street', 'new', 'york' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });

  test('space delimiter - multiple spaces / other whitespace', function(t) {

    var input = { text: ' 30 west \t26th \nstreet new york ' };
    var result = sanitiser({}, input);

    // leading/trailing whitespace must not generate empty tokens
    var expected = [ '30', 'west', '26th', 'street', 'new', 'york' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
};
// Comma handling: commas (single, repeated, leading or trailing) act as
// token delimiters and never end up inside a token.
module.exports.tests.comma_delimiter = function(test, common) {

  test('comma delimiter - simple', function(t) {

    var input = { text: '30 west 26th street, new york' };
    var result = sanitiser({}, input);

    // every token expected from the input, in order
    var expected = [ '30', 'west', '26th', 'street', 'new', 'york' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });

  test('comma delimiter - multiple commas', function(t) {

    var input = { text: ',30 west 26th street,,, new york,' };
    var result = sanitiser({}, input);

    // leading/trailing/repeated commas must not generate empty tokens
    var expected = [ '30', 'west', '26th', 'street', 'new', 'york' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
};
// Forward slash handling: '/' (single, repeated, leading or trailing) acts
// as a token delimiter, eg. for intersections such as 'A Street/B Avenue'.
module.exports.tests.forward_slash_delimiter = function(test, common) {

  test('forward slash delimiter - simple', function(t) {

    var input = { text: 'Bedell Street/133rd Avenue' };
    var result = sanitiser({}, input);

    // every token expected from the input, in order
    var expected = [ 'Bedell', 'Street', '133rd', 'Avenue' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });

  test('forward slash - multiple slashes', function(t) {

    var input = { text: '/Bedell Street//133rd Avenue/' };
    var result = sanitiser({}, input);

    // leading/trailing/repeated slashes must not generate empty tokens
    var expected = [ 'Bedell', 'Street', '133rd', 'Avenue' ];

    // tokens produced
    t.deepEquals(input.tokens, expected, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expected.slice(0, -1), 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expected.slice(-1), 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
};
// Final token single-gram handling: a trailing one-character token is kept
// as 'incomplete' only when it is numeric, otherwise it is dropped from
// 'tokens_incomplete' (while remaining present in 'tokens').
module.exports.tests.final_token_single_gram = function(test, common) {

  test('final token single gram - numeric', function(t) {

    var input = { text: 'grolmanstrasse 1' };
    var result = sanitiser({}, input);

    var expectedTokens = [ 'grolmanstrasse', '1' ];
    var expectedComplete = [ 'grolmanstrasse' ];
    var expectedIncomplete = [ '1' ];

    // tokens produced
    t.deepEquals(input.tokens, expectedTokens, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expectedComplete, 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(input.tokens_incomplete, expectedIncomplete, 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });

  test('final token single gram - non-numeric', function(t) {

    var input = { text: 'grolmanstrasse a' };
    var result = sanitiser({}, input);

    var expectedTokens = [ 'grolmanstrasse', 'a' ];
    var expectedComplete = [ 'grolmanstrasse' ];

    // tokens produced
    t.deepEquals(input.tokens, expectedTokens, 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(input.tokens_complete, expectedComplete, 'tokens produced');

    // last token removed!
    t.deepEquals(input.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(result.errors, [], 'no errors');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
};
// Back slash handling: '\' (single, repeated, leading or trailing) acts as a
// token delimiter. As with the other delimiter suites, the complete/incomplete
// split is asserted in addition to the raw token list.
module.exports.tests.back_slash_delimiter = function(test, common) {

  test('back slash delimiter - simple', function(t) {

    var clean = { text: 'Bedell Street\\133rd Avenue' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });

  test('back slash - multiple slashes', function(t) {

    var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};
// Mixed delimiters: any run combining commas, slashes, back slashes and
// whitespace is treated as a single delimiter. As with the other delimiter
// suites, the complete/incomplete split is asserted as well.
module.exports.tests.mixed_delimiter = function(test, common) {

  test('mixed delimiters', function(t) {

    var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};
// Registers every test group in this module with the supplied tape instance,
// prefixing each test name so the output identifies this suite.
module.exports.all = function (tape, common) {

  // wrapper passed to each group in place of tape itself
  var prefixedTest = function (name, testFunction) {
    return tape('SANITISER _tokenizer: ' + name, testFunction);
  };

  var groups = module.exports.tests;
  Object.keys(groups).forEach(function (groupName) {
    groups[groupName](prefixedTest, common);
  });
};

5
test/unit/sanitiser/autocomplete.js

@ -4,7 +4,10 @@ module.exports.tests = {};
module.exports.tests.sanitisers = function(test, common) { module.exports.tests.sanitisers = function(test, common) {
test('check sanitiser list', function (t) { test('check sanitiser list', function (t) {
var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; var expected = [
'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources',
'sources_and_layers', 'private', 'geo_autocomplete'
];
t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected);
t.end(); t.end();
}); });

Loading…
Cancel
Save