Browse Source

Merge pull request #520 from pelias/switch-to-text-analyzer-package

switched to pelias-text-analyzer package since that responsibility ha…
pull/522/head
Stephen K Hess 9 years ago
parent
commit
986c340f5b
  1. 86
      helper/text_parser.js
  2. 2
      package.json
  3. 4
      sanitiser/_text.js
  4. 150
      test/unit/helper/text_parser.js
  5. 1
      test/unit/query/autocomplete.js
  6. 8
      test/unit/query/search.js
  7. 1
      test/unit/run.js
  8. 4
      test/unit/sanitiser/search.js

86
helper/text_parser.js

@ -1,86 +0,0 @@
var parser = require('addressit');
var extend = require('extend');
var type_mapping = require('../helper/type_mapping');
var check = require('check-types');
var logger = require('pelias-logger').get('api');
var DELIM = ',';
/**
 * Choose the layers to search for very short text inputs.
 *
 * For performance, and to prefer POI and admin records, inputs of three
 * characters or fewer are limited to the coarse layers and need no address
 * parsing.
 *
 * @param {string} query - the raw input text; only its `length` is inspected
 * @returns {*} `type_mapping.layer_mapping.coarse` when the input is three
 *   characters or fewer, otherwise `undefined`
 */
module.exports.get_layers = function get_layers(query) {
  var isVeryShort = query.length <= 3;

  // longer inputs get no layer preference from this helper
  return isVeryShort ? type_mapping.layer_mapping.coarse : undefined;
};
/**
 * Parse free-form query text into address components.
 *
 * Two passes are combined:
 *   1. a naive split on DELIM, e.g. 'flatiron, new york, ny' becomes
 *      name 'flatiron' and admin_parts 'new york, ny';
 *   2. the full address parser, which is skipped when the re-joined text is
 *      three characters or fewer since that cannot contain an address.
 * Properties produced by the address parser overwrite those from the naive
 * split, and only a fixed list of known fields is kept in the result.
 *
 * @param {string} query - the raw input text
 * @returns {Object|null} the truthy known fields that were found, or `null`
 *   when `regions` is the only field found
 */
module.exports.get_parsed_address = function get_parsed_address(query) {
  var KNOWN_FIELDS = [
    'name',
    'number',
    'street',
    'city',
    'state',
    'country',
    'postalcode',
    'regions',
    'admin_parts'
  ];
  var separator = DELIM + ' ';
  var segments = query.split(DELIM);

  // naive approach - for admin matching during query time
  var combined = {};
  if (segments.length > 1) {
    var trailing = [];
    for (var i = 1; i < segments.length; i++) {
      // trim spaces from each part just in case
      trailing.push(segments[i].trim());
    }
    combined.name = segments[0].trim();
    combined.admin_parts = trailing.join(separator);
  }

  // full address parsing runs on the text re-joined with consistent spacing
  var rejoined = segments.join(separator);
  var fromParser = (rejoined.length > 3) ? parser( rejoined ) : undefined;
  var merged = extend(combined, fromParser);

  // keep only the known fields that hold a truthy value
  var parsed_text = KNOWN_FIELDS.reduce(function (kept, field) {
    if (merged[field]) {
      kept[field] = merged[field];
    }
    return kept;
  }, {});

  // regions on their own are not enough information to make smarter decisions
  var foundFields = Object.keys(parsed_text);
  var hasRegions = !check.undefined(parsed_text.regions);
  if (foundFields.length === 1 && hasRegions) {
    logger.info('Ignoring address parser output, regions only');
    return null;
  }

  return parsed_text;
};

2
package.json

@ -35,7 +35,6 @@
"elasticsearch": ">=1.2.1"
},
"dependencies": {
"addressit": "git://github.com/dianashk/addressit.git#temp",
"async": "^1.5.2",
"check-types": "^6.0.0",
"cluster2": "git://github.com/missinglink/cluster2.git#node_zero_twelve",
@ -56,6 +55,7 @@
"pelias-model": "^4.0.0",
"pelias-query": "6.2.0",
"pelias-suggester-pipeline": "2.0.4",
"pelias-text-analyzer": "^1.0.1",
"stats-lite": "1.0.3",
"through2": "2.0.1"
},

4
sanitiser/_text.js

@ -1,5 +1,5 @@
var check = require('check-types'),
text_parser = require('../helper/text_parser');
text_analyzer = require('pelias-text-analyzer');
// validate texts, convert types and apply defaults
function sanitize( raw, clean ){
@ -19,7 +19,7 @@ function sanitize( raw, clean ){
clean.text = raw.text;
// parse text with query parser
var parsed_text = text_parser.get_parsed_address(clean.text);
var parsed_text = text_analyzer.parse(clean.text);
if (check.assigned(parsed_text)) {
clean.parsed_text = parsed_text;
}

150
test/unit/helper/text_parser.js

@ -1,150 +0,0 @@
var parser = require('../../../helper/text_parser');
var type_mapping = require('../../../helper/type_mapping');
var layers_map = type_mapping.layer_mapping;
module.exports.tests = {};
// Checks that the parser module exposes both of its public functions.
module.exports.tests.interface = function(test, common) {
  var expectedFunctions = ['get_parsed_address', 'get_layers'];

  test('interface', function(t) {
    expectedFunctions.forEach(function (fnName) {
      t.equal(typeof parser[fnName], 'function', 'valid function');
    });
    t.end();
  });
};
// Checks the naive comma split: text before the first comma becomes `name`,
// the remainder becomes `admin_parts`, with or without a space after the comma.
module.exports.tests.split_on_comma = function(test, common) {
  var queries = [
    { name: 'soho', admin_parts: 'new york' },
    { name: 'chelsea', admin_parts: 'london' },
    { name: '123 main', admin_parts: 'new york' }
  ];

  // shared assertions for one input string built from a fixture
  var assertSplit = function (t, text, query) {
    var address = parser.get_parsed_address(text);
    t.equal(typeof address, 'object', 'valid object');
    t.equal(address.name, query.name, 'name set correctly to ' + address.name);
    t.equal(address.admin_parts, query.admin_parts, 'admin_parts set correctly to ' + address.admin_parts);
    t.end();
  };

  queries.forEach(function (query) {
    // build the title from the fixture's fields: concatenating the object
    // itself produced "[object Object]" for every case
    var label = query.name + ', ' + query.admin_parts;

    test('naive parsing ' + label, function(t) {
      assertSplit(t, query.name + ', ' + query.admin_parts, query);
    });

    test('naive parsing ' + label + ' without spaces', function(t) {
      assertSplit(t, query.name + ',' + query.admin_parts, query);
    });
  });
};
// Checks that inputs of three characters or fewer still yield an object from
// get_parsed_address and that get_layers restricts them to the coarse layers.
module.exports.tests.parse_three_chars_or_less = function(test, common) {
  var chars_queries = ['a', 'bb', 'ccc'];
  var num_queries = ['1', '12', '123'];
  var alphanum_q = ['a1', '1a2', '12c'];
  var queries = chars_queries.concat(num_queries).concat(alphanum_q);

  queries.forEach(function(query) {
    // the fixtures include three-character inputs, so the title says <= 3
    test('query length <= 3 (' + query + ')', function(t) {
      var address = parser.get_parsed_address(query);
      var target_layer = layers_map.coarse;
      var layers = parser.get_layers(query);

      t.equal(typeof address, 'object', 'valid object');
      t.deepEqual(layers, target_layer, 'layers set correctly to ' + target_layer.join(', '));
      t.end();
    });
  });
};
// Checks inputs for which get_parsed_address is expected to return null,
// i.e. nothing address specific is detected.
module.exports.tests.parse_one_token = function(test, common) {
  var cases = [
    { title: 'query with one token', input: 'yugolsavia' },
    { title: 'query with two tokens, no numbers', input: 'small town' },
    { title: 'query with two tokens, number first', input: '123 main' },
    { title: 'query with two tokens, number second', input: 'main 123' },
    { title: 'query with many tokens', input: 'main particle new york' }
  ];

  cases.forEach(function (testCase) {
    test(testCase.title, function (t) {
      var address = parser.get_parsed_address(testCase.input);
      t.equal(address, null, 'nothing address specific detected');
      t.end();
    });
  });
};
// Checks full address parsing: house number, street, regions, state and
// postal code, including comma-delimited input with and without spaces.
module.exports.tests.parse_address = function(test, common) {
  test('valid address, house number', function(t) {
    var query_string = '123 main st new york ny';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '123', 'parsed house number');
    t.equal(address.street, 'main st', 'parsed street');
    t.deepEqual(address.regions, ['new york'], 'parsed city');
    t.equal(address.state , 'NY', 'parsed state');
    t.end();
  });

  test('valid address, zipcode', function(t) {
    var query_string = '123 main st new york ny 10010';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '123', 'parsed house number');
    t.equal(address.street, 'main st', 'parsed street');
    t.deepEqual(address.regions, ['new york'], 'parsed city');
    t.equal(address.state , 'NY', 'parsed state');
    t.equal(address.postalcode, '10010', 'parsed zip is a string');
    t.end();
  });

  test('valid address with leading 0s in zipcode', function(t) {
    var query_string = '339 W Main St, Cheshire, 06410';
    var address = parser.get_parsed_address(query_string);

    // the expected postal code is a string so the leading zero is kept
    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.street, 'W Main St', 'parsed street');
    t.deepEqual(address.regions, ['Cheshire'], 'parsed city');
    t.equal(address.postalcode, '06410', 'parsed zip');
    t.end();
  });

  test('valid address without spaces after commas', function(t) {
    var query_string = '339 W Main St,Lancaster,PA';
    var address = parser.get_parsed_address(query_string);

    t.equal(typeof address, 'object', 'valid object for the address');
    t.equal(address.number, '339', 'parsed house number');
    t.equal(address.street, 'W Main St', 'parsed street');
    t.deepEqual(address.regions, ['Lancaster'], 'parsed city');
    t.equal(address.state, 'PA', 'parsed state');
    t.end();
  });
};
// Runs every registered test group, prefixing each test name so the output
// identifies this suite.
module.exports.all = function (tape, common) {
  var prefixedTest = function (name, testFunction) {
    return tape('QUERY PARSING: ' + name, testFunction);
  };

  Object.keys(module.exports.tests).forEach(function (testCase) {
    module.exports.tests[testCase](prefixedTest, common);
  });
};

1
test/unit/query/autocomplete.js

@ -1,6 +1,5 @@
var generate = require('../../../query/autocomplete');
var parser = require('../../../helper/text_parser');
module.exports.tests = {};

8
test/unit/query/search.js

@ -1,5 +1,5 @@
var generate = require('../../../query/search');
var parser = require('../../../helper/text_parser');
var text_analyzer = require('pelias-text-analyzer');
module.exports.tests = {};
@ -128,7 +128,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(address),
parsed_text: text_analyzer.parse(address),
});
var compiled = JSON.parse( JSON.stringify( query ) );
@ -143,7 +143,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: partial_address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(partial_address),
parsed_text: text_analyzer.parse(partial_address),
});
var compiled = JSON.parse( JSON.stringify( query ) );
@ -158,7 +158,7 @@ module.exports.tests.query = function(test, common) {
var query = generate({ text: partial_address,
layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ],
querySize: 10,
parsed_text: parser.get_parsed_address(partial_address),
parsed_text: text_analyzer.parse(partial_address),
});
var compiled = JSON.parse( JSON.stringify( query ) );

1
test/unit/run.js

@ -19,7 +19,6 @@ var tests = [
require('./helper/labelGenerator_GBR'),
require('./helper/labelGenerator_USA'),
require('./helper/labelSchema'),
require('./helper/text_parser'),
require('./helper/type_mapping'),
require('./helper/sizeCalculator'),
require('./middleware/confidenceScore'),

4
test/unit/sanitiser/search.js

@ -1,6 +1,6 @@
var extend = require('extend'),
search = require('../../../sanitiser/search'),
parser = require('../../../helper/text_parser'),
text_analyzer = require('pelias-text-analyzer'),
sanitize = search.sanitize,
middleware = search.middleware,
defaultError = 'invalid param \'text\': text length, must be >0';
@ -80,7 +80,7 @@ module.exports.tests.sanitize_text_with_delim = function(test, common) {
sanitize( req, function( ){
var expected_text = text;
var expected_parsed_text = parser.get_parsed_address(text);
var expected_parsed_text = text_analyzer.parse(text);
t.equal(req.errors[0], undefined, 'no error');
t.equal(req.clean.parsed_text.name, expected_parsed_text.name, 'clean name set correctly');
t.equal(req.clean.text, expected_text, 'text should match');

Loading…
Cancel
Save