Tuesday 24 April 2012
Extract data from web page
To retrieve a web page, do the following:
/**
 * Fetches and parses a single web page.
 *
 * The parsed result is held in the local `doc`; this sample does nothing
 * further with it.
 *
 * @param env  host object supplied by the spider runner; provides
 *             newURL() and newJsoup()
 * @param args argument map supplied by the runner (unused here)
 */
function main(env, args) {
  var link = 'http://astore.amazon.com/paesia-20?node=22';
  // The comment sits on its own line so it can never swallow the code that
  // follows if the script is collapsed onto one line again.
  // Timeout in milliseconds.
  var timeout = 60000;
  var doc = env.newJsoup().parse(env.newURL(link), timeout);
}
The 'doc' variable holds an object of the Document class.
With the 'doc' variable, data can be extracted as follows:
/**
 * Fetches one page and logs the title and URL of every anchor matched by
 * the '#searchbrowse a' selector.
 *
 * @param env  host object supplied by the spider runner; provides
 *             newURL(), newJsoup() and info()
 * @param args argument map supplied by the runner (unused here)
 */
function main(env, args) {
  var link = 'http://astore.amazon.com/paesia-20?node=22';
  // The comment sits on its own line so it can never swallow the code that
  // follows if the script is collapsed onto one line again.
  // Timeout in milliseconds.
  var timeout = 60000;
  var doc = env.newJsoup().parse(env.newURL(link), timeout);

  var elements = doc.select('#searchbrowse a');
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var title = element.text();
    var url = element.attr('href');
    env.info('Title: ' + title + '\nUrl: ' + url);
  }
}
The 'elements' variable is an object of the Elements class.
The 'element' variable is an object of the Element class.
Create basic spider
Each spider is a JavaScript file containing one main() function:
/**
 * Minimal spider: appends a greeting to the list stored under the 'links'
 * key of the argument map.
 *
 * @param env  host object supplied by the spider runner (unused here)
 * @param args argument map; must hold a list with an add() method under 'links'
 */
function main(env, args) {
  args.get('links').add('Hello world!');
}
'env' is an object of the Machine class.
'args' is an object of the HashMap class.
'args' contains a value under the 'links' key, which is an object of the ArrayList class. The 'links' object is printed to the log after the spider finishes running.
In a spider script, objects that belong to Java classes are accessed the same way as in Java. Calling a method whose return value or parameters are objects of unsupported classes results in an error. This prevents the spider from accessing restricted resources such as the file system. The spider runs in a sandbox, which keeps the system safe.
Grab products from Amazon aStores
This spider grabs products from Amazon aStores and saves them to Lucene indexes.
/**
 * Spider entry point: grabs products from an Amazon aStore and stores each
 * one as a 'Link' entity (url, title, plain-text description).
 *
 * When `node` is empty, every category returned by loadCategories() is
 * crawled and then marked 'crawled'. Otherwise only `node` is crawled and
 * nothing is marked.
 *
 * Conventions used throughout this script:
 * - `var` and ES5 syntax are kept on purpose; the host script engine is not
 *   visible from here, so no newer syntax is assumed.
 * - Loose equality (`==`) is kept for values returned by host objects, since
 *   they may be host-wrapped rather than native JavaScript values.
 *
 * @param env  host object supplied by the spider runner
 * @param args argument map supplied by the runner (unused here)
 */
function main(env, args) {
  var astore = 'paesia';
  var node = ''; // empty: crawl every category that is not marked yet
  var frompage = 1;
  var topage = 10000;
  var batch = 5; // number of listing pages fetched per grabProduct() call
  var cache = true; // false: forget earlier 'crawled' marks and start over

  if (node.length != 0) {
    crawlNode(astore, node, frompage, topage, batch, env);
    return;
  }

  if (!cache) {
    clearCategoryMarks(env);
  }
  var nodelist = loadCategories(env);
  // The category index has its own name: sharing `i` with an inner loop
  // (as an earlier revision did) corrupts the iteration over categories.
  for (var c = 0; c < nodelist.size(); c++) {
    node = nodelist.get(c);
    crawlNode(astore, node, frompage, topage, batch, env);
    env.info('Saved all from category: ' + node);
    markCategory(node, env);
  }
}

/**
 * Crawls one category in batches of `batch` listing pages and saves every
 * product found. Stops at the first batch that yields no products.
 */
function crawlNode(astore, node, frompage, topage, batch, env) {
  for (var no = frompage; no <= topage; no += batch) {
    var last = Math.min(no + batch - 1, topage);
    var products = grabProduct(astore, node, no, last, env);
    if (products.size() == 0) break;
    for (var p = 0; p < products.size(); p++) {
      saveProduct(products.get(p), env);
    }
    env.info('Saved: ' + products.size());
  }
}

/** Resets the mark of every 'Category_Amazon' entity to the empty string. */
function clearCategoryMarks(env) {
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    category.setMark('');
    category.save();
  }
}

/**
 * Marks the 'Category_Amazon' entity whose 'node' field equals `node` as
 * 'crawled'. Does nothing when no such entity is found.
 */
function markCategory(node, env) {
  var finder = env.newEntity();
  var results = finder.search('Category_Amazon', finder.newTermQuery(finder.newTerm('node', node)), 1);
  if (results.size() == 0) return;
  var category = results.get(0);
  category.setMark('crawled');
  category.save();
}

/**
 * Returns a list of the 'node' values of all 'Category_Amazon' entities
 * that are not marked 'crawled'.
 */
function loadCategories(env) {
  var tag = env.newArrayList();
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    if (category.getMark() == 'crawled') continue;
    tag.add(category.getString('node'));
  }
  return tag;
}

/** Saves a 'Link' entity unless one with the same url already exists. */
function saveLink(title, url, desc, env) {
  if (findLinkByUrl(url, env)) return;
  var entity = env.newEntity();
  entity.setSchema('s|url|a|title|a|desc');
  entity.setKind('Link');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.save();
}

/** Returns true when a 'Link' entity with the given url already exists. */
function findLinkByUrl(url, env) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  return entity.count('Link', query, 1) > 0;
}

/**
 * Saves one grabbed product as a link. Products without a title or url are
 * skipped. The HTML description, when present, is reduced to plain text.
 *
 * @param pro product map as produced by grabProduct()
 */
function saveProduct(pro, env) {
  var title = pro.get('title');
  var url = pro.get('url');
  if (title == null || url == null) return;
  // String() makes the emptiness test work whether the map hands back a
  // native string or a host-wrapped one.
  if (String(title).length == 0 || String(url).length == 0) return;

  // Test for null BEFORE coercing: null + '' is the 4-character string
  // 'null', which would otherwise be stored as the description.
  var raw = pro.get('description');
  var desc = raw == null ? '' : raw + '';
  if (desc.length > 0) {
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }
  saveLink(title, url, desc, env);
}

/**
 * Grabs the products listed on pages frompage..topage of one category.
 *
 * Each listing page is handled independently: a failure on one page or one
 * product is reported through env.error() and the crawl continues.
 *
 * @returns list of product maps; keys that may be present: 'code', 'title',
 *          'url', 'small-image', 'large-image', 'description', 'details',
 *          'editorial-reviews', 'list-price', 'offer-price', 'buy-url'
 */
function grabProduct(astore, node, frompage, topage, env) {
  var tag = env.newArrayList();
  for (var no = frompage; no <= topage; no++) {
    try {
      var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var doc = env.newJsoup().parse(alink, 60000);
      var map = readListing(doc, alink, env);
      attachThumbnails(doc, map);

      var keys = env.getKeys(map);
      for (var k = 0; k < keys.size(); k++) {
        try {
          var item = map.get(keys.get(k));
          loadProductDetails(item, env);
          env.info(node + ' : ' + no + ' : ' + (k + 1) + ' : ' + item.get('url'));
        } catch (e) {
          env.error(e);
        }
      }
      // Products are added even when their detail page failed to load.
      for (var a = 0; a < keys.size(); a++) {
        tag.add(map.get(keys.get(a)));
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

/** Returns the text after the last '/detail/' in href, or null if absent. */
function productCodeOf(href) {
  var pos = href.lastIndexOf('/detail/');
  return pos < 0 ? null : href.substring(pos + 8); // 8 == '/detail/'.length
}

/** Builds a map code -> {code, title, url} from the text row of a listing page. */
function readListing(doc, alink, env) {
  var map = env.newHashMap();
  var elements = doc.select('#featuredProducts .textrow a');
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var title = element.text();
    var href = element.attr('href');
    var code = productCodeOf(href);
    if (code == null) continue;
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', title);
    item.put('url', env.newURL(alink, href) + '');
    map.put(code, item);
  }
  return map;
}

/** Adds 'small-image' (and a non-empty alt text as 'title') from the image row. */
function attachThumbnails(doc, map) {
  var elements = doc.select('#featuredProducts .imagerow a');
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var code = productCodeOf(element.attr('href'));
    if (code == null) continue;
    var item = map.get(code);
    if (item == null) continue;
    var img = element.select('img').first();
    if (img == null) continue;
    var title = img.attr('alt');
    var smimg = img.attr('src');
    if (title != null && String(title).length > 0) {
      item.put('title', title);
    }
    item.put('small-image', smimg);
  }
}

/** Fetches the product's own page and copies images, texts and prices into item. */
function loadProductDetails(item, env) {
  var pageUrl = item.get('url') + '';
  var doc = env.newJsoup().parse(env.newURL(pageUrl), 60000);

  var element = doc.select('#detailImage img').first();
  if (element != null) {
    item.put('large-image', element.attr('src'));
  }

  // [selector, heading to cut off (or null), key in item]
  var sections = [
    ['#productDescription', '<h2>Product Description</h2>', 'description'],
    ['#productDetails', '<h2>Product Details</h2>', 'details'],
    ['#editorialReviews', null, 'editorial-reviews']
  ];
  for (var s = 0; s < sections.length; s++) {
    element = doc.select(sections[s][0]).first();
    if (element == null) continue;
    var html = extractFragment(element, sections[s][1], pageUrl, env);
    if (html != null) {
      item.put(sections[s][2], html);
    }
  }

  element = doc.select('#detailListPrice').first();
  if (element != null) {
    item.put('list-price', element.text());
  }
  element = doc.select('#detailOfferPrice').first();
  if (element != null) {
    item.put('offer-price', element.text());
  }
  element = doc.select('#addToCartForm a').first();
  if (element != null) {
    item.put('buy-url', element.attr('href'));
  }
}

/**
 * Returns the element's HTML with everything up to and including `heading`
 * removed (when heading is given and found) and with link/image URLs made
 * absolute; returns null when the result still contains '<html'.
 */
function extractFragment(element, heading, pageUrl, env) {
  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }
  var fragment = env.newJsoup().parse(html, pageUrl);
  buildURL(fragment, pageUrl, env);
  html = fragment.select('body').first().html();
  return html.indexOf('<html') < 0 ? html : null;
}

/**
 * Rewrites the 'href' of every <a> and the 'src' of every <img> in doc to
 * the URL obtained by resolving it against baseUrl.
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  var targets = [['a', 'href'], ['img', 'src']];
  for (var t = 0; t < targets.length; t++) {
    var attribute = targets[t][1];
    var elements = doc.select(targets[t][0]);
    for (var i = 0; i < elements.size(); i++) {
      var element = elements.get(i);
      var url = env.newURL(base, element.attr(attribute));
      element.attr(attribute, url + '');
    }
  }
}
Grab categories from Amazon aStores
This spider grabs categories from Amazon aStores and saves them to Lucene indexes.
/**
 * Spider entry point: grabs the category tree of an Amazon aStore and saves
 * each category as a 'Category_Amazon' entity (node, title, parent).
 * Any error is reported through env.error().
 *
 * `var` and ES5 syntax are kept on purpose; the host script engine is not
 * visible from here, so no newer syntax is assumed.
 *
 * @param env  host object supplied by the spider runner
 * @param args argument map supplied by the runner (unused here)
 */
function main(env, args) {
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      saveCategory(cat.get('title'), cat.get('node'), cat.get('parent'), env);
    }
  } catch (e) {
    env.error(e);
  }
}

/** Saves a 'Category_Amazon' entity unless one with the same node exists. */
function saveCategory(title, node, parent, env) {
  if (findCategoryByNode(node, env)) return;
  var entity = env.newEntity();
  entity.setSchema('s|node|s|title|s|parent');
  entity.setKind('Category_Amazon');
  entity.setId(env.uniqid());
  entity.setString('node', node);
  entity.setString('title', title);
  entity.setString('parent', parent);
  entity.save();
}

/** Returns true when a 'Category_Amazon' entity with this node exists. */
function findCategoryByNode(node, env) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('node', node));
  return entity.count('Category_Amazon', query, 1) > 0;
}

/**
 * Collects the categories of an aStore.
 *
 * The store's front page supplies the top-level categories (parent ''); then
 * the page of every known node is visited in discovery order and its
 * '.indent' links are added as children of that node. The node list grows
 * while it is being walked, so nested levels are reached as well.
 *
 * On error the exception is reported through env.error() and whatever was
 * collected so far is returned.
 *
 * @returns list of maps with keys 'title', 'node' and 'parent'
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  try {
    var nodelist = env.newArrayList();
    var root = 'http://astore.amazon.com/' + astore + '-20';
    var alink = env.newURL(root);
    var doc = env.newJsoup().parse(alink, 60000);
    collectCategories(doc.select('#searchbrowse a'), '', tag, nodelist, env);

    // nodelist.size() is re-read on every pass because the loop body appends.
    for (var no = 0; no < nodelist.size(); no++) {
      var parent = nodelist.get(no);
      alink = env.newURL(root + '?node=' + parent);
      doc = env.newJsoup().parse(alink, 60000);
      collectCategories(doc.select('#searchbrowse .indent a'), parent, tag, nodelist, env);
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

/** Returns the value of the last 'node=' parameter in href, or null if absent. */
function nodeFromHref(href) {
  var pos = href.lastIndexOf('node=');
  if (pos < 0) return null;
  var node = href.substring(pos + 5); // 5 == 'node='.length
  var amp = node.indexOf('&');
  return amp >= 0 ? node.substring(0, amp) : node;
}

/**
 * Appends one category per link whose node is not yet in nodelist, recording
 * it in both tag (full record) and nodelist (node id only).
 */
function collectCategories(elements, parent, tag, nodelist, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var title = element.text();
    var node = nodeFromHref(element.attr('href'));
    if (node == null) continue;
    // Skipping known nodes keeps the same category page from being fetched twice.
    if (nodelist.indexOf(node) >= 0) continue;
    var item = env.newHashMap();
    item.put('title', title);
    item.put('node', node);
    item.put('parent', parent);
    tag.add(item);
    nodelist.add(node);
    env.info(node + ' : ' + title);
  }
}
Subscribe to:
Posts (Atom)