This spider grabs categories from Amazon aStores and saves them to Lucene indexes.
/**
 * Script entry point: crawls the category list of the hard-coded aStore
 * ('paesia') with grabCategory() and hands every category found to
 * saveCategory().
 *
 * Any exception raised while crawling or saving is reported through
 * env.error() instead of propagating to the caller.
 *
 * @param {Object} env  Host-provided helper object (factories, logging).
 * @param {*}      args Not used by this script.
 */
function main(env, args) {
var astore = 'paesia';
try {
var categories = grabCategory(astore, env);
// categories is a Java-style list (size()/get()), each entry a map with
// 'title', 'node' and 'parent' keys as filled in by grabCategory().
for (var i = 0; i < categories.size(); i++) {
var cat = categories.get(i);
saveCategory(cat.get('title'), cat.get('node'), cat.get('parent'), env);
}
} catch (e) {
env.error(e);
}
}
/**
 * Stores one category as a 'Category_Amazon' entity, unless
 * findCategoryByNode() reports that an entity with this node id already
 * exists, in which case nothing is written.
 *
 * @param {string} title  Category title.
 * @param {string} node   Category node id.
 * @param {string} parent Node id of the parent category ('' for top level).
 * @param {Object} env    Host-provided helper object (entity factory, uniqid).
 */
function saveCategory(title, node, parent, env) {
var alreadyStored = findCategoryByNode(node, env);
if (alreadyStored) {
return;
}
var record = env.newEntity();
record.setSchema('s|node|s|title|s|parent');
record.setKind('Category_Amazon');
record.setId(env.uniqid());
// Field order mirrors the schema string above.
var fields = [
['node', node],
['title', title],
['parent', parent]
];
for (var idx = 0; idx < fields.length; idx++) {
record.setString(fields[idx][0], fields[idx][1]);
}
record.save();
}
/**
 * Tells whether a 'Category_Amazon' entity whose 'node' field equals the
 * given node id is already stored.
 *
 * @param {string} node Category node id to look up.
 * @param {Object} env  Host-provided helper object (entity factory).
 * @returns {boolean} true when the count query reports at least one match.
 */
function findCategoryByNode(node, env) {
var probe = env.newEntity();
var nodeTerm = probe.newTerm('node', node);
var nodeQuery = probe.newTermQuery(nodeTerm);
// NOTE(review): the trailing 1 is presumably a result limit - one hit is
// enough to answer the question; confirm against the entity API.
var matches = probe.count('Category_Amazon', nodeQuery, 1);
return matches > 0;
}
/**
 * Pulls the category node id out of a link URL: the text following the last
 * 'node=' up to (not including) the next '&', or to the end of the URL.
 *
 * @param {string} url Link target (href).
 * @returns {?string} The node id, or null when the URL has no 'node='.
 */
function extractNodeId(url) {
var pos = url.lastIndexOf('node=');
if (pos < 0) {
return null;
}
var node = url.substring(pos + 5); // 5 === 'node='.length
var end = node.indexOf('&');
return end >= 0 ? node.substring(0, end) : node;
}

/**
 * Appends one category item per not-yet-seen node link in `elements` to
 * `tag`, records the node id in `nodelist`, and logs it via env.info().
 *
 * @param {Object} elements Selected link elements (size()/get(); text()/attr()).
 * @param {string} parent   Parent node id stored on every item added.
 * @param {Object} tag      Result list receiving the category items.
 * @param {Object} nodelist List of node ids already seen; extended in place.
 * @param {Object} env      Host-provided helper object.
 */
function collectCategoryLinks(elements, parent, tag, nodelist, env) {
for (var i = 0; i < elements.size(); i++) {
var element = elements.get(i);
var title = element.text();
var node = extractNodeId(element.attr('href'));
// Skip links that are not category links or that point at a node
// already collected (avoids duplicate entries and repeated fetches).
if (node === null || nodelist.indexOf(node) >= 0) continue;
var item = env.newHashMap();
item.put('title', title);
item.put('node', node);
item.put('parent', parent);
tag.add(item);
nodelist.add(node);
env.info(node + ' : ' + title);
}
}

/**
 * Crawls the category tree of an Amazon aStore.
 *
 * First reads the store's home page and collects every '#searchbrowse a'
 * link carrying a node id as a top-level category (parent ''). Then visits
 * the page of each collected node in turn and collects its
 * '#searchbrowse .indent a' links as children of that node; newly found
 * nodes are queued and visited as well. Each node id is collected once.
 *
 * Any exception (for example a failed page fetch) is reported through
 * env.error() and stops the crawl; the categories gathered so far are
 * still returned.
 *
 * @param {string} astore aStore name; '-20' is appended to form the store id.
 * @param {Object} env    Host-provided helper object (list/map/URL/Jsoup
 *                        factories, info/error logging).
 * @returns {Object} List (from env.newArrayList()) of maps with the keys
 *                   'title', 'node' and 'parent'.
 */
function grabCategory(astore, env) {
var tag = env.newArrayList();
try {
var nodelist = env.newArrayList();
var baseUrl = 'http://astore.amazon.com/' + astore + '-20';
var doc = env.newJsoup().parse(env.newURL(baseUrl), 60000);
collectCategoryLinks(doc.select('#searchbrowse a'), '', tag, nodelist, env);
// nodelist doubles as the work queue: it grows while being walked, so
// the size is re-read on every iteration.
for (var no = 0; no < nodelist.size(); no++) {
var parent = nodelist.get(no);
doc = env.newJsoup().parse(env.newURL(baseUrl + '?node=' + parent), 60000);
collectCategoryLinks(doc.select('#searchbrowse .indent a'), parent, tag, nodelist, env);
}
} catch (e) {
env.error(e);
}
return tag;
}
No comments:
Post a Comment