From bd3b8e7bbed7acdf1b4266cc122c66390e818b0b Mon Sep 17 00:00:00 2001 From: Stephen Hess Date: Tue, 19 Apr 2016 21:38:56 -0400 Subject: [PATCH] switched to pelias-text-analyzer package since that responsibility has been moved to there --- helper/text_parser.js | 86 ------------------ package.json | 2 +- sanitiser/_text.js | 4 +- test/unit/helper/text_parser.js | 150 -------------------------------- test/unit/query/autocomplete.js | 1 - test/unit/query/search.js | 8 +- test/unit/run.js | 1 - test/unit/sanitiser/search.js | 4 +- 8 files changed, 9 insertions(+), 247 deletions(-) delete mode 100644 helper/text_parser.js delete mode 100644 test/unit/helper/text_parser.js diff --git a/helper/text_parser.js b/helper/text_parser.js deleted file mode 100644 index 0db8bede..00000000 --- a/helper/text_parser.js +++ /dev/null @@ -1,86 +0,0 @@ - -var parser = require('addressit'); -var extend = require('extend'); -var type_mapping = require('../helper/type_mapping'); -var check = require('check-types'); -var logger = require('pelias-logger').get('api'); - -var DELIM = ','; - -/* - * For performance, and to prefer POI and admin records, express a preference - * to only search coarse layers on very short text inputs. - */ -module.exports.get_layers = function get_layers(query) { - if (query.length <= 3 ) { - // no address parsing required - return type_mapping.layer_mapping.coarse; - } -}; - -module.exports.get_parsed_address = function get_parsed_address(query) { - - var getAdminPartsBySplittingOnDelim = function(queryParts) { - // naive approach - for admin matching during query time - // split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny' - - var address = {}; - - if (queryParts.length > 1) { - address.name = queryParts[0].trim(); - - // 1. slice away all parts after the first one - // 2. trim spaces from each part just in case - // 3. join the parts back together with appropriate delimiter and spacing - address.admin_parts = queryParts.slice(1) - .map(function (part) { return part.trim(); }) - .join(DELIM + ' '); - } - - return address; - }; - - var getAddressParts = function(query) { - // perform full address parsing - // except on queries so short they obviously can't contain an address - if (query.length > 3) { - return parser( query ); - } - }; - - var queryParts = query.split(DELIM); - - var addressWithAdminParts = getAdminPartsBySplittingOnDelim(queryParts); - var addressWithAddressParts= getAddressParts(queryParts.join(DELIM + ' ')); - - var parsedAddress = extend(addressWithAdminParts, - addressWithAddressParts); - - var address_parts = [ 'name', - 'number', - 'street', - 'city', - 'state', - 'country', - 'postalcode', - 'regions', - 'admin_parts' - ]; - - var parsed_text = {}; - - address_parts.forEach(function(part){ - if (parsedAddress[part]) { - parsed_text[part] = parsedAddress[part]; - } - }); - - // if all we found was regions, ignore it as it is not enough information to make smarter decisions - if (Object.keys(parsed_text).length === 1 && !check.undefined(parsed_text.regions)) - { - logger.info('Ignoring address parser output, regions only'); - return null; - } - - return parsed_text; -}; diff --git a/package.json b/package.json index 0c0cc1f7..fc511657 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,6 @@ "elasticsearch": ">=1.2.1" }, "dependencies": { - "addressit": "git://github.com/dianashk/addressit.git#temp", "async": "^1.5.2", "check-types": "^6.0.0", "cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve", @@ -56,6 +55,7 @@ "pelias-model": "^3.1.0", "pelias-query": "6.2.0", "pelias-suggester-pipeline": "2.0.4", + "pelias-text-analyzer": "^1.0.1", "stats-lite": "1.0.3", "through2": "2.0.1" }, diff --git a/sanitiser/_text.js b/sanitiser/_text.js index e6897a5e..4709eeee 100644 --- a/sanitiser/_text.js +++ b/sanitiser/_text.js @@ -1,5 +1,5 @@ var check = require('check-types'), - text_parser = require('../helper/text_parser'); + text_analyzer = require('pelias-text-analyzer'); // validate texts, convert types and apply defaults function sanitize( raw, clean ){ @@ -19,7 +19,7 @@ function sanitize( raw, clean ){ clean.text = raw.text; // parse text with query parser - var parsed_text = text_parser.get_parsed_address(clean.text); + var parsed_text = text_analyzer.parse(clean.text); if (check.assigned(parsed_text)) { clean.parsed_text = parsed_text; } diff --git a/test/unit/helper/text_parser.js b/test/unit/helper/text_parser.js deleted file mode 100644 index ca5b05f0..00000000 --- a/test/unit/helper/text_parser.js +++ /dev/null @@ -1,150 +0,0 @@ -var parser = require('../../../helper/text_parser'); - -var type_mapping = require('../../../helper/type_mapping'); -var layers_map = type_mapping.layer_mapping; - -module.exports.tests = {}; - -module.exports.tests.interface = function(test, common) { - test('interface', function(t) { - t.equal(typeof parser.get_parsed_address, 'function', 'valid function'); - t.equal(typeof parser.get_layers, 'function', 'valid function'); - t.end(); - }); -}; - -module.exports.tests.split_on_comma = function(test, common) { - var queries = [ - { name: 'soho', admin_parts: 'new york' }, - { name: 'chelsea', admin_parts: 'london' }, - { name: '123 main', admin_parts: 'new york' } - ]; - - queries.forEach(function (query) { - test('naive parsing ' + query, function(t) { - var address = parser.get_parsed_address(query.name + ', ' + query.admin_parts); - - t.equal(typeof address, 'object', 'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - - test('naive parsing ' + query + 'without spaces', function(t) { - var address = parser.get_parsed_address(query.name + ',' + query.admin_parts); - - t.equal(typeof address, 'object', 'valid object'); - t.equal(address.name, query.name, 'name set correctly to ' + address.name); - t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts); - t.end(); - }); - }); -}; - -module.exports.tests.parse_three_chars_or_less = function(test, common) { - var chars_queries = ['a', 'bb', 'ccc']; - var num_queries = ['1', '12', '123']; - var alphanum_q = ['a1', '1a2', '12c']; - - var queries = chars_queries.concat(num_queries).concat(alphanum_q); - queries.forEach(function(query) { - test('query length < 3 (' + query + ')', function(t) { - var address = parser.get_parsed_address(query); - var target_layer = layers_map.coarse; - var layers = parser.get_layers(query); - - t.equal(typeof address, 'object', 'valid object'); - t.deepEqual(layers, target_layer, 'admin_parts set correctly to ' + target_layer.join(', ')); - t.end(); - }); - }); -}; - -module.exports.tests.parse_one_token = function(test, common) { - test('query with one token', function (t) { - var address = parser.get_parsed_address('yugolsavia'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, no numbers', function (t) { - var address = parser.get_parsed_address('small town'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number first', function (t) { - var address = parser.get_parsed_address('123 main'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with two tokens, number second', function (t) { - var address = parser.get_parsed_address('main 123'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); - test('query with many tokens', function(t) { - var address = parser.get_parsed_address('main particle new york'); - t.equal(address, null, 'nothing address specific detected'); - t.end(); - }); -}; - -module.exports.tests.parse_address = function(test, common) { - test('valid address, house number', function(t) { - var query_string = '123 main st new york ny'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.end(); - }); - test('valid address, zipcode', function(t) { - var query_string = '123 main st new york ny 10010'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '123', 'parsed house number'); - t.equal(address.street, 'main st', 'parsed street'); - t.deepEqual(address.regions, ['new york'], 'parsed city'); - t.equal(address.state , 'NY', 'parsed state'); - t.equal(address.postalcode, '10010', 'parsed zip is a string'); - t.end(); - }); - test('valid address with leading 0s in zipcode', function(t) { - var query_string = '339 W Main St, Cheshire, 06410'; - var address = parser.get_parsed_address(query_string); - - console.log(address); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Cheshire'], 'parsed city'); - t.equal(address.postalcode, '06410', 'parsed zip'); - t.end(); - }); - test('valid address without spaces after commas', function(t) { - var query_string = '339 W Main St,Lancaster,PA'; - var address = parser.get_parsed_address(query_string); - - t.equal(typeof address, 'object', 'valid object for the address'); - t.equal(address.number, '339', 'parsed house number'); - t.equal(address.street, 'W Main St', 'parsed street'); - t.deepEqual(address.regions, ['Lancaster'], 'parsed city'); - t.deepEqual(address.state, 'PA', 'parsed state'); - t.end(); - }); -}; - - -module.exports.all = function (tape, common) { - - function test(name, testFunction) { - return tape('QUERY PARSING: ' + name, testFunction); - } - - for( var testCase in module.exports.tests ){ - module.exports.tests[testCase](test, common); - } -}; diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 0e09457b..f4b44ffa 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -1,6 +1,5 @@ var generate = require('../../../query/autocomplete'); -var parser = require('../../../helper/text_parser'); module.exports.tests = {}; diff --git a/test/unit/query/search.js b/test/unit/query/search.js index e503311b..426eb2bc 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -1,5 +1,5 @@ var generate = require('../../../query/search'); -var parser = require('../../../helper/text_parser'); +var text_analyzer = require('pelias-text-analyzer'); module.exports.tests = {}; @@ -128,7 +128,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(address), + parsed_text: text_analyzer.parse(address), }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -143,7 +143,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -158,7 +158,7 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: partial_address, layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: parser.get_parsed_address(partial_address), + parsed_text: text_analyzer.parse(partial_address), }); var compiled = JSON.parse( JSON.stringify( query ) ); diff --git a/test/unit/run.js b/test/unit/run.js index 6f64f2e6..94d9ebb3 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -19,7 +19,6 @@ var tests = [ require('./helper/labelGenerator_GBR'), require('./helper/labelGenerator_USA'), require('./helper/labelSchema'), - require('./helper/text_parser'), require('./helper/type_mapping'), require('./helper/sizeCalculator'), require('./middleware/confidenceScore'), diff --git a/test/unit/sanitiser/search.js b/test/unit/sanitiser/search.js index e09672ce..864195a8 100644 --- a/test/unit/sanitiser/search.js +++ b/test/unit/sanitiser/search.js @@ -1,6 +1,6 @@ var extend = require('extend'), search = require('../../../sanitiser/search'), - parser = require('../../../helper/text_parser'), + text_analyzer = require('pelias-text-analyzer'), sanitize = search.sanitize, middleware = search.middleware, defaultError = 'invalid param \'text\': text length, must be >0'; @@ -80,7 +80,7 @@ module.exports.tests.sanitize_text_with_delim = function(test, common) { sanitize( req, function( ){ var expected_text = text; - var expected_parsed_text = parser.get_parsed_address(text); + var expected_parsed_text = text_analyzer.parse(text); t.equal(req.errors[0], undefined, 'no error'); t.equal(req.clean.parsed_text.name, expected_parsed_text.name, 'clean name set correctly'); t.equal(req.clean.text, expected_text, 'text should match');