From 7e4559fdc22186f5e99d530a3120b6ecb01b546b Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Mon, 15 Oct 2018 23:30:35 -0400 Subject: [PATCH] fix(sanitizer): Trim whitespace in addressit queries This is a follow-up PR to https://github.com/pelias/api/pull/1171 and https://github.com/pelias/api/pull/1170. Apparently we have two different `text` sanitizers, and autocomplete queries were treating a single space as valid input. This had a particularly bad outcome, as it would end up generating queries (see an [example](https://gist.github.com/orangejulius/2cc26c7eed39311b6eaf1fb0175c13e6)) that had no main query clause. This caused them to match basically every document in the index. Looking at the geocode.earth slowlog, these queries took **__8 seconds per shard__**. --- sanitizer/_text_addressit.js | 8 ++++++-- test/unit/sanitizer/_text_addressit.js | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js index 6cf31f4d..81f7cc7c 100644 --- a/sanitizer/_text_addressit.js +++ b/sanitizer/_text_addressit.js @@ -3,6 +3,9 @@ var parser = require('addressit'); var _ = require('lodash'); var logger = require('pelias-logger').get('api'); +// ref: https://en.wikipedia.org/wiki/Quotation_mark +const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`; + // validate texts, convert types and apply defaults function _sanitize( raw, clean ){ @@ -10,7 +13,8 @@ function _sanitize( raw, clean ){ var messages = { errors: [], warnings: [] }; // invalid input 'text' - if( !check.nonEmptyString( raw.text ) ){ + const text = _.trim( _.trim( raw.text ), QUOTES ); + if( !check.nonEmptyString( text ) ){ messages.errors.push('invalid param \'text\': text length, must be >0'); } @@ -18,7 +22,7 @@ function _sanitize( raw, clean ){ else { // valid text - clean.text = raw.text; + clean.text = text; clean.parser = 'addressit'; // remove anything that may have been parsed before diff --git 
a/test/unit/sanitizer/_text_addressit.js b/test/unit/sanitizer/_text_addressit.js index 5fad89a8..6db5472b 100644 --- a/test/unit/sanitizer/_text_addressit.js +++ b/test/unit/sanitizer/_text_addressit.js @@ -339,6 +339,20 @@ module.exports.tests.text_parser = function(test, common) { }); + test('whitespace-only input counts as empty', (t) => { + const raw = { text: ' ' }; + const clean = {}; + + const expected_clean = {}; + + const messages = sanitizer.sanitize(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, ['invalid param \'text\': text length, must be >0']); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); + test('return an array of expected parameters in object form for validation', (t) => { const expected = [{ name: 'text' }]; const validParameters = sanitizer.expected();