diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js index 7b8e234c..3312ea05 100644 --- a/sanitiser/_tokenizer.js +++ b/sanitiser/_tokenizer.js @@ -27,9 +27,26 @@ function sanitize( raw, clean ){ // if the text parser has run then we only tokenize the 'name' section // of the 'parsed_text' object, ignoring the 'admin' parts. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ + if( clean.hasOwnProperty('parsed_text') ) { inputParserRanSuccessfully = true; - text = clean.parsed_text.name; // use this string instead + + // parsed_text.name is set, this is the highest priority, use this string + if( clean.parsed_text.hasOwnProperty('name') ){ + text = clean.parsed_text.name; // use this string instead + } + + // else handle the case where parsed_text.street was produced but + // no parsed_text.name is produced. + // additionally, handle the case where parsed_text.number is present + // note: the addressit module may also produce parsed_text.unit info + // for now, we discard that information as we don't have an appropriate place to store it. + else if( clean.parsed_text.hasOwnProperty('street') ){ + text = [ + clean.parsed_text.number, + clean.parsed_text.street + ].filter(function(el){return el;}) + .join(' '); // remove empty elements + } } // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. 
diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js index a7c6ced4..8837d4ab 100644 --- a/test/unit/sanitiser/_tokenizer.js +++ b/test/unit/sanitiser/_tokenizer.js @@ -81,6 +81,38 @@ module.exports.tests.sanity_checks = function(test, common) { t.deepEquals(messages.errors, [], 'no errors'); t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); + test('favor clean.parsed_text street data over clean.text', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text street data over clean.text + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over all other variables + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); }); };