From b1107a0c8fa7937fff3974222efbb7a7d0bf73d0 Mon Sep 17 00:00:00 2001
From: Julian Simioni <julian@simioni.org>
Date: Thu, 18 Oct 2018 10:32:27 -0400
Subject: [PATCH] fix(autocomplete): detect the case where input text is
 unsubstantial

It's possible for the `text` input to /v1/autocomplete to be of non-zero
length after trimming whitespace and quotes, but still be insufficient
to use for geocoding.

One common case is that it contains only commas, slashes, or other
delimiters.

Our query logic does not currently handle this case: it generates
Elasticsearch queries that have no primary `must` clause and end up
searching every document in the index. These queries are slow, consume
cluster resources, and are not useful.

By detecting unsubstantial inputs, we can prevent this.
---
 sanitizer/_tokenizer.js           |  6 ++++++
 test/unit/sanitizer/_tokenizer.js | 35 +++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js
index 081a9121..2be5eeee 100644
--- a/sanitizer/_tokenizer.js
+++ b/sanitizer/_tokenizer.js
@@ -62,6 +62,9 @@ function _sanitize( raw, clean ){
     clean.tokens = text
       .split(/[\s,\\\/]+/) // split on delimeters
       .filter(function(el){return el;}); // remove empty elements
+  } else {
+    // text is empty, this sanitizer should be a no-op
+    return messages;
   }
 
   /**
@@ -96,6 +99,9 @@ function _sanitize( raw, clean ){
       }
     }
 
+  } else {
+    // set error if no substantial tokens were found
+    messages.errors.push('invalid `text` input: must contain more than just delimiters');
   }
 
   return messages;
diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js
index e5becebb..04dd9f5b 100644
--- a/test/unit/sanitizer/_tokenizer.js
+++ b/test/unit/sanitizer/_tokenizer.js
@@ -51,6 +51,41 @@ module.exports.tests.sanity_checks = function(test, common) {
 
     t.end();
   });
+
+  test('just a comma - should error', function(t) {
+
+    var clean = { text: ',' };
+    var messages = sanitizer.sanitize({}, clean);
+
+    // no tokens produced
+    t.deepEquals(clean.tokens, [], 'no tokens');
+    t.deepEquals(clean.tokens_complete, [], 'no tokens');
+    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');
+
+    // helpful error message
+    t.deepEquals(messages.errors, ['invalid `text` input: must contain more than just delimiters'], 'error produced');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+
+    t.end();
+  });
+
+  test('several commas - should error', function(t) {
+
+    var clean = { text: ',,,\\\/   ,,' };
+    var messages = sanitizer.sanitize({}, clean);
+
+    // no tokens produced
+    t.deepEquals(clean.tokens, [], 'no tokens');
+    t.deepEquals(clean.tokens_complete, [], 'no tokens');
+    t.deepEquals(clean.tokens_incomplete, [], 'no tokens');
+
+    // helpful error message
+    t.deepEquals(messages.errors, ['invalid `text` input: must contain more than just delimiters'], 'error produced');
+    t.deepEquals(messages.warnings, [], 'no warnings');
+
+    t.end();
+  });
+
   test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) {
 
     var clean = { parsed_text: { text: {} } };