fix(autocomplete): detect the case where input text is unsubstantial

The `text` input to /v1/autocomplete can be of non-zero length after trimming whitespace and quotes, yet still be insufficient to use for geocoding. One common case is input that contains only commas, slashes, or other delimiters. Our query logic currently does not handle this case: it generates Elasticsearch queries that have no primary `must` clause and therefore end up searching every document in the index. These queries are slow, take up cluster resources, and are not useful. By detecting unsubstantial inputs and reporting an error, we can prevent this.
6 years ago · b1107a0c8f
2 changed files with 41 additions and 0 deletions
--- a/sanitizer/_tokenizer.js
+++ b/sanitizer/_tokenizer.js
@ -62,6 +62,9 @@ function _sanitize( raw, clean ){
    // break the text into tokens at every run of one or more delimiter
    // characters: whitespace, commas, backslashes or forward slashes
    clean.tokens = text
      .split(/[\s,\\\/]+/) // split on delimiters
      .filter(function(el){return el;}); // remove empty elements (e.g. the '' that split() yields for leading/trailing delimiters)
  } else {
    // text is empty, this sanitizer should be a no-op
    return messages;
  }
  /**
@ -96,6 +99,9 @@ function _sanitize( raw, clean ){
      }
    }
  } else {
    // set error if no substantial tokens were found
    messages.errors.push('invalid `text` input: must contain more than just delimiters');
  }
  return messages;
--- a/test/unit/sanitizer/_tokenizer.js
+++ b/test/unit/sanitizer/_tokenizer.js
@ -51,6 +51,41 @@ module.exports.tests.sanity_checks = function(test, common) {
    t.end();
  });
  // A lone comma is a delimiter only: the sanitizer must produce empty token
  // lists and report a single descriptive error.
  test('just a comma - should error', function(t) {
    var expectedErrors = ['invalid `text` input: must contain more than just delimiters'];
    var input = { text: ',' };
    var result = sanitizer.sanitize({}, input);

    // none of the token lists may contain anything
    ['tokens', 'tokens_complete', 'tokens_incomplete'].forEach(function(field) {
      t.deepEquals(input[field], [], 'no tokens');
    });

    // the caller is told why the input was rejected, without extra warnings
    t.deepEquals(result.errors, expectedErrors, 'error produced');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
  // A mix of commas, a backslash, a forward slash and spaces still contains
  // nothing but delimiters: expect empty token lists and a single error.
  test('several commas - should error', function(t) {
    var expectedErrors = ['invalid `text` input: must contain more than just delimiters'];
    var input = { text: ',,,\\\/   ,,' };
    var result = sanitizer.sanitize({}, input);

    // none of the token lists may contain anything
    ['tokens', 'tokens_complete', 'tokens_incomplete'].forEach(function(field) {
      t.deepEquals(input[field], [], 'no tokens');
    });

    // the caller is told why the input was rejected, without extra warnings
    t.deepEquals(result.errors, expectedErrors, 'error produced');
    t.deepEquals(result.warnings, [], 'no warnings');

    t.end();
  });
  test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) {
    var clean = { parsed_text: { text: {} } };