mirror of https://github.com/pelias/api.git
missinglink
9 years ago
12 changed files with 616 additions and 73 deletions
@@ -0,0 +1,95 @@
var check = require('check-types');

/**
  a simplified version of the elasticsearch tokenizer, used in order to
  be able to detect which tokens are 'complete' (the user has finished typing them)
  or 'incomplete' (the user has possibly only typed part of the token).

  note: we don't need to strip punctuation as that will be handled on the
  elasticsearch side, so sending a token such as 'st.' is not an issue; these
  tokens should *not* be modified, as the analysis can use the punctuation to
  infer meaning.

  note: this sanitizer should run *after* the '_text' sanitizer so it can
  use the output of clean.parsed_text where available.
**/
function sanitize( raw, clean ){

  // error & warning messages
  var messages = { errors: [], warnings: [] };

  // this is the string we will use for analysis
  var text = clean.text;

  // a boolean to track whether the input parser ran successfully or not
  var inputParserRanSuccessfully = false;

  // if the text parser has run then we only tokenize the 'name' section
  // of the 'parsed_text' object, ignoring the 'admin' parts.
  if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){
    inputParserRanSuccessfully = true;
    text = clean.parsed_text.name; // use this string instead
  }

  // always set the 'clean.tokens*' arrays for consistency and to avoid upstream errors.
  clean.tokens = [];
  clean.tokens_complete = [];
  clean.tokens_incomplete = [];

  // sanity check that the text is valid.
  if( check.nonEmptyString( text ) ){

    // split according to the regex used in the elasticsearch tokenizer
    // see: https://github.com/pelias/schema/blob/master/settings.js
    // see: settings.analysis.tokenizer.peliasNameTokenizer
    clean.tokens = text
      .split(/[\s,\\\/]+/)                  // split on delimiters
      .filter(function(el){ return el; });  // remove empty elements
  }
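
  // e.g. (illustrative only): for text '30 west 26th street, new york' the
  // split above yields clean.tokens = [ '30', 'west', '26th', 'street', 'new', 'york' ]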

  /**
    the following section splits the tokens into two arrays called
    'tokens_complete' and 'tokens_incomplete'.

    it also strips any tokens from 'tokens_incomplete' which might not
    match the ngrams index (such as single grams not stored in the index).
  **/

  // split the tokens into 'complete' and 'incomplete'.
  if( clean.tokens.length ){

    // if all the tokens are complete, simply copy them from clean.tokens
    if( inputParserRanSuccessfully ){

      // all these tokens are complete!
      clean.tokens_complete = clean.tokens.slice();

    // user hasn't finished typing yet
    } else {

      // make a copy of the tokens and remove the last element
      var tokensCopy = clean.tokens.slice(),
          lastToken = tokensCopy.pop();

      // set all but the last token as 'complete'
      clean.tokens_complete = tokensCopy;

      /**
        if the last token is a single non-numeric character then we must discard it.

        at the time of writing, single non-numeric ngrams are not stored in the index,
        so sending them as part of the query would result in 0 documents being returned.
      **/
      if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){
        clean.tokens_incomplete = [ lastToken ];
      }
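
      // e.g. (illustrative only): a trailing '1' stays in tokens_incomplete, while a
      // trailing single letter such as 'a' is dropped (see the single-gram tests below)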
    }
  }

  return messages;
}

// export function
module.exports = sanitize;
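
A minimal usage sketch of the sanitizer above (illustrative only; the require path is inferred from the test file's relative require, and the expected values mirror the tests below):

    var sanitize = require('./sanitiser/_tokenizer');

    var clean = { text: '30 west 26th street, new york' };
    var messages = sanitize({}, clean);

    // clean.tokens            => [ '30', 'west', '26th', 'street', 'new', 'york' ]
    // clean.tokens_complete   => [ '30', 'west', '26th', 'street', 'new' ]
    // clean.tokens_incomplete => [ 'york' ]
    // messages                => { errors: [], warnings: [] }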
@@ -0,0 +1,425 @@
var sanitiser = require('../../../sanitiser/_tokenizer');

module.exports.tests = {};

module.exports.tests.sanity_checks = function(test, common) {
  test('clean.text not set', function(t) {

    var clean = {}; // clean.text not set
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('clean.text not a string', function(t) {

    var clean = { text: {} }; // clean.text not a string
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('empty string', function(t) {

    var clean = { text: '' };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) {

    var clean = { parsed_text: { text: {} } };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('favor clean.parsed_text.name over clean.text', function(t) {

    var clean = { parsed_text: { name: 'foo' }, text: 'bar' };
    var messages = sanitiser({}, clean);

    // favor clean.parsed_text.name over clean.text
    t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.space_delimiter = function(test, common) {
  test('space delimiter - simple', function(t) {

    var clean = { text: '30 west 26th street new york' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('space delimiter - multiple spaces / other whitespace', function(t) {

    var clean = { text: ' 30 west \t26th \nstreet new york ' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.comma_delimiter = function(test, common) {
  test('comma delimiter - simple', function(t) {

    var clean = { text: '30 west 26th street, new york' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('comma delimiter - multiple commas', function(t) {

    var clean = { text: ',30 west 26th street,,, new york,' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.forward_slash_delimiter = function(test, common) {
  test('forward slash delimiter - simple', function(t) {

    var clean = { text: 'Bedell Street/133rd Avenue' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('forward slash - multiple slashes', function(t) {

    var clean = { text: '/Bedell Street//133rd Avenue/' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.final_token_single_gram = function(test, common) {
  test('final token single gram - numeric', function(t) {

    var clean = { text: 'grolmanstrasse 1' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'grolmanstrasse',
      '1'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'grolmanstrasse'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      '1'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('final token single gram - non-numeric', function(t) {

    var clean = { text: 'grolmanstrasse a' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'grolmanstrasse',
      'a'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'grolmanstrasse'
    ], 'tokens produced');

    // last token removed!
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.back_slash_delimiter = function(test, common) {
  test('back slash delimiter - simple', function(t) {

    var clean = { text: 'Bedell Street\\133rd Avenue' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('back slash - multiple slashes', function(t) {

    var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.mixed_delimiter = function(test, common) {
  test('mixed delimiters', function(t) {

    var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.all = function (tape, common) {
  function test(name, testFunction) {
    return tape('SANITISER _tokenizer: ' + name, testFunction);
  }

  for( var testCase in module.exports.tests ){
    module.exports.tests[testCase](test, common);
  }
};
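
A sketch of how this suite would typically be driven (illustrative only; it assumes a tape-compatible harness and a test-runner path, neither of which is shown in this diff):

    var tape = require('tape');

    // path assumed for illustration; the suite only needs something to call all(tape, common)
    var suite = require('./test/unit/sanitiser/_tokenizer');
    suite.all(tape, {}); // the 'common' argument is unused by these tests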