mirror of https://github.com/pelias/api.git
Julian Simioni
9 years ago
38 changed files with 997 additions and 177 deletions
@ -0,0 +1,40 @@
var peliasQuery = require('pelias-query'),
    searchDefaults = require('../search_defaults');

/**
  This view (unfortunately) requires autocomplete to use the phrase.* index.

  Ideally we wouldn't need to use this, but at the time of writing we are unable
  to distinguish between 'complete tokens' and 'grams' in the name.* index.

  This view was introduced in order to score exact matches higher than partial
  matches; without it we find results such as "Clayton Avenue" appearing first
  in the results list for the query "Clay Av".

  The view uses some of the values from the 'search_defaults.js' file to add an
  additional 'SHOULD' condition which scores exact matches slightly higher
  than partial matches.
**/

module.exports = function( vs ){

  // make a copy of the variables so we don't interfere with the values
  // passed to other views.
  var vsCopy = new peliasQuery.Vars( vs.export() );

  // copy phrase:* values from search defaults
  vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']);
  vsCopy.var('phrase:field').set(searchDefaults['phrase:field']);

  // get a copy of the *complete* tokens produced from the input:name
  var tokens = vs.var('input:name:tokens_complete').get();

  // no valid tokens to use, fail now, don't render this view.
  if( !tokens || tokens.length < 1 ){ return null; }

  // set 'input:name' to be only the fully completed characters
  vsCopy.var('input:name').set( tokens.join(' ') );

  return peliasQuery.view.phrase( vsCopy );
};
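
A rough usage sketch of how this view could be registered in an autocomplete query builder. The filename and the FilteredBooleanQuery wiring are illustrative assumptions; only the variable names come from the code above.

var peliasQuery = require('pelias-query');
var exactMatchView = require('./phrase_first_tokens_only'); // hypothetical filename for the module above

// a sketch: register the view as an additional scoring ('SHOULD') condition,
// alongside the usual autocomplete views, so fully-typed tokens rank slightly
// higher than partially-typed ones ("Clay Av" -> exact 'Clay' matches first).
var query = new peliasQuery.layout.FilteredBooleanQuery();
query.score( exactMatchView );

// at render time the view reads 'input:name:tokens_complete' (produced by the
// _tokenizer sanitiser added later in this changeset) and returns null when
// there are no complete tokens, in which case the condition is simply omitted.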
@ -0,0 +1,17 @@
var peliasQuery = require('pelias-query'),
    check = require('check-types');

/**
  Population / Popularity subquery
**/

module.exports = function( vs ){

  var view = peliasQuery.view.ngrams( vs );

  view.match['name.default'].analyzer = vs.var('phrase:analyzer');
  delete view.match['name.default'].boost;

  return view;
};
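
A rough sketch of what this subquery renders; the function_score blocks in the fixture further down wrap it with popularity/population factors. Assumptions labelled in the comments: the filename and the availability of peliasQuery.defaults for the ngram:* variables required by view.ngrams are illustrative.

var peliasQuery = require('pelias-query');
var ngramsStrict = require('./ngrams_strict'); // hypothetical filename for the module above

// variables normally assembled by the autocomplete query layer
// (assumes pelias-query exports its default variables as peliasQuery.defaults)
var vs = new peliasQuery.Vars( peliasQuery.defaults );
vs.var('input:name').set('k road');
vs.var('phrase:analyzer').set('peliasQueryFullToken');

// renders roughly: { match: { 'name.default': { analyzer: 'peliasQueryFullToken', query: 'k road' } } }
// (the default ngrams boost is deleted so the wrapping function_score controls the weighting)
var subquery = ngramsStrict( vs );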
@ -0,0 +1,112 @@
var check = require('check-types');

/**
  A simplified version of the elasticsearch tokenizer, used in order to
  be able to detect which tokens are 'complete' (the user has finished typing them)
  or 'incomplete' (the user has possibly only typed part of the token).

  note: we don't need to strip punctuation as that will be handled on the
  elasticsearch side, so sending a token such as 'st.' is not an issue; these
  tokens should *not* be modified as the analysis can use the punctuation to
  infer meaning.

  note: this sanitizer should run *after* the '_text' sanitizer so it can
  use the output of clean.parsed_text where available.
**/
function sanitize( raw, clean ){

  // error & warning messages
  var messages = { errors: [], warnings: [] };

  // this is the string we will use for analysis
  var text = clean.text;

  // a boolean to track whether the input parser ran successfully; or not.
  var inputParserRanSuccessfully = false;

  // if the text parser has run then we only tokenize the 'name' section
  // of the 'parsed_text' object, ignoring the 'admin' parts.
  if( clean.hasOwnProperty('parsed_text') ) {
    inputParserRanSuccessfully = true;

    // parsed_text.name is set, this is the highest priority, use this string
    if( clean.parsed_text.hasOwnProperty('name') ){
      text = clean.parsed_text.name; // use this string instead
    }

    // else handle the case where parsed_text.street was produced but
    // no parsed_text.name was produced.
    // additionally, handle the case where parsed_text.number is present.
    // note: the addressit module may also produce parsed_text.unit info;
    // for now, we discard that information as we don't have an appropriate
    // field to store it in.
    else if( clean.parsed_text.hasOwnProperty('street') ){
      text = [
        clean.parsed_text.number,
        clean.parsed_text.street
      ].filter(function(el){ return el; }) // remove empty elements
       .join(' ');
    }
  }

  // always set the 'clean.tokens*' arrays for consistency and to avoid upstream errors.
  clean.tokens = [];
  clean.tokens_complete = [];
  clean.tokens_incomplete = [];

  // sanity check that the text is valid.
  if( check.nonEmptyString( text ) ){

    // split according to the regex used in the elasticsearch tokenizer
    // see: https://github.com/pelias/schema/blob/master/settings.js
    // see: settings.analysis.tokenizer.peliasNameTokenizer
    clean.tokens = text
      .split(/[\s,\\\/]+/)                  // split on delimiters
      .filter(function(el){ return el; });  // remove empty elements
  }

  /**
    The following section splits the tokens in to two arrays called
    'tokens_complete' and 'tokens_incomplete'.

    It also strips any tokens from 'tokens_incomplete' which might not
    match the ngrams index (such as single grams not stored in the index).
  **/

  // split the tokens in to 'complete' and 'incomplete'.
  if( clean.tokens.length ){

    // if the input parser ran successfully then all the tokens are complete,
    // so simply copy them from clean.tokens
    if( inputParserRanSuccessfully ){

      // all these tokens are complete!
      clean.tokens_complete = clean.tokens.slice();

    // user hasn't finished typing yet
    } else {

      // make a copy of the tokens and remove the last element
      var tokensCopy = clean.tokens.slice(),
          lastToken = tokensCopy.pop();

      // set all but the last token as 'complete'
      clean.tokens_complete = tokensCopy;

      /**
        if the last token is a single non-numeric character then we must discard it.

        at the time of writing, single non-numeric ngrams are not stored in the index;
        sending them as part of the query would result in 0 documents being returned.
      **/
      if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){
        clean.tokens_incomplete = [ lastToken ];
      }
    }
  }

  return messages;
}

// export function
module.exports = sanitize;
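
For reference, a short sketch of the sanitizer's observable behaviour, mirroring the unit tests further down in this changeset (the require path is illustrative):

var sanitize = require('../../../sanitiser/_tokenizer'); // path as used in the tests below

var clean = { text: '30 west 26th street new' };
var messages = sanitize({}, clean);

// clean.tokens            => [ '30', 'west', '26th', 'street', 'new' ]
// clean.tokens_complete   => [ '30', 'west', '26th', 'street' ]
// clean.tokens_incomplete => [ 'new' ]
// messages                => { errors: [], warnings: [] }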
@ -0,0 +1,155 @@
module.exports = {
  'query': {
    'filtered': {
      'query': {
        'bool': {
          'must': [{
            'match': {
              'name.default': {
                'analyzer': 'peliasQueryFullToken',
                'type': 'phrase',
                'boost': 1,
                'slop': 3,
                'query': 'k road'
              }
            }
          }],
          'should': [
            {
              'match': {
                'address_parts.street': {
                  'query': 'k road',
                  'boost': 5,
                  'analyzer': 'peliasStreet'
                }
              }
            }, {
              'match': {
                'parent.country': {
                  'query': 'laird',
                  'boost': 800,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.region': {
                  'query': 'laird',
                  'boost': 600,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.region_a': {
                  'query': 'laird',
                  'boost': 600,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.county': {
                  'query': 'laird',
                  'boost': 400,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.borough': {
                  'analyzer': 'peliasAdmin',
                  'boost': 600,
                  'query': 'laird'
                }
              }
            }, {
              'match': {
                'parent.localadmin': {
                  'query': 'laird',
                  'boost': 200,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.locality': {
                  'query': 'laird',
                  'boost': 200,
                  'analyzer': 'peliasAdmin'
                }
              }
            }, {
              'match': {
                'parent.neighbourhood': {
                  'query': 'laird',
                  'boost': 200,
                  'analyzer': 'peliasAdmin'
                }
              }
            },
            {
              'match': {
                'phrase.default': {
                  'analyzer': 'peliasPhrase',
                  'type': 'phrase',
                  'boost': 1,
                  'slop': 3,
                  'query': 'k road'
                }
              }
            },
            {
              'function_score': {
                'query': {
                  'match': {
                    'name.default': {
                      'analyzer': 'peliasQueryFullToken',
                      'query': 'k road'
                    }
                  }
                },
                'max_boost': 20,
                'score_mode': 'first',
                'boost_mode': 'replace',
                'functions': [{
                  'field_value_factor': {
                    'modifier': 'log1p',
                    'field': 'popularity',
                    'missing': 1
                  },
                  'weight': 1
                }]
              }
            }, {
              'function_score': {
                'query': {
                  'match': {
                    'name.default': {
                      'analyzer': 'peliasQueryFullToken',
                      'query': 'k road'
                    }
                  }
                },
                'max_boost': 20,
                'score_mode': 'first',
                'boost_mode': 'replace',
                'functions': [{
                  'field_value_factor': {
                    'modifier': 'log1p',
                    'field': 'population',
                    'missing': 1
                  },
                  'weight': 3
                }]
              }
            }]
        }
      }
    }
  },
  'sort': [ '_score' ],
  'size': 20,
  'track_scores': true
};
@ -0,0 +1,457 @@
var sanitiser = require('../../../sanitiser/_tokenizer');

module.exports.tests = {};

module.exports.tests.sanity_checks = function(test, common) {
  test('clean.text not set', function(t) {

    var clean = {}; // clean.text not set
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('clean.text not a string', function(t) {

    var clean = { text: {} }; // clean.text not a string
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('empty string', function(t) {

    var clean = { text: '' };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) {

    var clean = { parsed_text: { text: {} } };
    var messages = sanitiser({}, clean);

    // no tokens produced
    t.deepEquals(clean.tokens, [], 'no tokens');
    t.deepEquals(clean.tokens_complete, [], 'no tokens');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('favor clean.parsed_text.name over clean.text', function(t) {

    var clean = { parsed_text: { name: 'foo' }, text: 'bar' };
    var messages = sanitiser({}, clean);

    // favor clean.parsed_text.name over clean.text
    t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('favor clean.parsed_text street data over clean.text', function(t) {

    var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' };
    var messages = sanitiser({}, clean);

    // favor clean.parsed_text street data over clean.text
    t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number');
    t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) {

    var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' };
    var messages = sanitiser({}, clean);

    // favor clean.parsed_text.name over all other variables
    t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name');
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.space_delimiter = function(test, common) {
  test('space delimiter - simple', function(t) {

    var clean = { text: '30 west 26th street new york' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('space delimiter - multiple spaces / other whitespace', function(t) {

    var clean = { text: ' 30 west \t26th \nstreet new york ' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.comma_delimiter = function(test, common) {
  test('comma delimiter - simple', function(t) {

    var clean = { text: '30 west 26th street, new york' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('comma delimiter - multiple commas', function(t) {

    var clean = { text: ',30 west 26th street,,, new york,' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      '30',
      'west',
      '26th',
      'street',
      'new',
      'york'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      '30',
      'west',
      '26th',
      'street',
      'new'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'york'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.forward_slash_delimiter = function(test, common) {
  test('forward slash delimiter - simple', function(t) {

    var clean = { text: 'Bedell Street/133rd Avenue' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('forward slash - multiple slashes', function(t) {

    var clean = { text: '/Bedell Street//133rd Avenue/' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'Bedell',
      'Street',
      '133rd'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.final_token_single_gram = function(test, common) {
  test('final token single gram - numeric', function(t) {

    var clean = { text: 'grolmanstrasse 1' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'grolmanstrasse',
      '1'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'grolmanstrasse'
    ], 'tokens produced');

    // last token marked as 'incomplete'
    t.deepEquals(clean.tokens_incomplete, [
      '1'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('final token single gram - non-numeric', function(t) {

    var clean = { text: 'grolmanstrasse a' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'grolmanstrasse',
      'a'
    ], 'tokens produced');

    // all but last token marked as 'complete'
    t.deepEquals(clean.tokens_complete, [
      'grolmanstrasse'
    ], 'tokens produced');

    // last token removed!
    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.back_slash_delimiter = function(test, common) {
  test('back slash delimiter - simple', function(t) {

    var clean = { text: 'Bedell Street\\133rd Avenue' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
  test('back slash - multiple slashes', function(t) {

    var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.tests.mixed_delimiter = function(test, common) {
  test('mixed delimiters', function(t) {

    var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' };
    var messages = sanitiser({}, clean);

    // tokens produced
    t.deepEquals(clean.tokens, [
      'Bedell',
      'Street',
      '133rd',
      'Avenue'
    ], 'tokens produced');

    // no errors/warnings produced
    t.deepEquals(messages.errors, [], 'no errors');
    t.deepEquals(messages.warnings, [], 'no warnings');

    t.end();
  });
};

module.exports.all = function (tape, common) {
  function test(name, testFunction) {
    return tape('SANITISER _tokenizer: ' + name, testFunction);
  }

  for( var testCase in module.exports.tests ){
    module.exports.tests[testCase](test, common);
  }
};