From b1107a0c8fa7937fff3974222efbb7a7d0bf73d0 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Thu, 18 Oct 2018 10:32:27 -0400 Subject: [PATCH] fix(autocomplete): detect the case where input text is unsubstantial It's possible for the `text` input to /v1/autocomplete to be of non-zero length after trimming whitespace and quotes, but still be insufficient to use for geocoding. One common case is that it contains only commas, slashes, or other delimiters. Our query logic currently does not handle this case, and will generate Elasticsearch queries that do not have a primary `must` clause and end up searching every document in the index. These queries are slow, take up cluster resources, and are not useful. By detecting unsubstantial inputs, we can prevent this. --- sanitizer/_tokenizer.js | 6 ++++++ test/unit/sanitizer/_tokenizer.js | 35 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 081a9121..2be5eeee 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -62,6 +62,9 @@ function _sanitize( raw, clean ){ clean.tokens = text .split(/[\s,\\\/]+/) // split on delimeters .filter(function(el){return el;}); // remove empty elements + } else { + // text is empty, this sanitizer should be a no-op + return messages; } /** @@ -96,6 +99,9 @@ function _sanitize( raw, clean ){ } } + } else { + // set error if no substantial tokens were found + messages.errors.push('invalid `text` input: must contain more than just delimiters'); } return messages; diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index e5becebb..04dd9f5b 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -51,6 +51,41 @@ module.exports.tests.sanity_checks = function(test, common) { t.end(); }); + + test('just a comma - should error', function(t) { + + var clean = { text: ',' }; + var messages = sanitizer.sanitize({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // helpful error message + t.deepEquals(messages.errors, ['invalid `text` input: must contain more than just delimiters'], 'error produced'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + + test('several commas - should error', function(t) { + + var clean = { text: ',,,\\\/ ,,' }; + var messages = sanitizer.sanitize({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // helpful error message + t.deepEquals(messages.errors, ['invalid `text` input: must contain more than just delimiters'], 'error produced'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { var clean = { parsed_text: { text: {} } };