This spider grabs categories from Amazon aStores and saves them to Lucene indexes.
/**
 * Entry point: collects the categories of one aStore and stores each of them.
 *
 * The store id is fixed to 'paesia'. Any exception raised while crawling or
 * saving is reported through env.error and not rethrown.
 *
 * @param env  Host object; this function calls env.error and hands env on to
 *             grabCategory and saveCategory.
 * @param args Not read by this function.
 */
function main(env, args) {
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      // saveCategory skips nodes that are already stored.
      saveCategory(cat.get('title'), cat.get('node'), cat.get('parent'), env);
    }
  } catch (e) {
    env.error(e);
  }
}
21 | |
/**
 * Stores one category as a 'Category_Amazon' entity, unless
 * findCategoryByNode reports that the node is already stored.
 *
 * @param title  Category title, written to the 'title' field.
 * @param node   Category node id, written to the 'node' field and used for the
 *               existence check.
 * @param parent Parent node id, written to the 'parent' field.
 * @param env    Host object; supplies newEntity() and uniqid().
 */
function saveCategory(title, node, parent, env) {
  if (findCategoryByNode(node, env)) {
    return;
  }
  var entity = env.newEntity();
  // NOTE(review): the schema string looks like type|name pairs with
  // 's' = string — confirm against the entity API.
  entity.setSchema('s|node|s|title|s|parent');
  entity.setKind('Category_Amazon');
  entity.setId(env.uniqid());
  entity.setString('node', node);
  entity.setString('title', title);
  entity.setString('parent', parent);
  entity.save();
}
34 | |
/**
 * Reports whether a 'Category_Amazon' entity with the given node id exists.
 *
 * @param node Node id matched against the 'node' field with a term query.
 * @param env  Host object; supplies newEntity().
 * @returns true when the count for the query is greater than zero.
 */
function findCategoryByNode(node, env) {
  var entity = env.newEntity();
  var term = entity.newTerm('node', node);
  var query = entity.newTermQuery(term);
  // NOTE(review): the trailing 1 is presumably a result limit, which is all an
  // existence check needs — confirm against the entity API.
  return entity.count('Category_Amazon', query, 1) > 0;
}
41 | |
/**
 * Crawls the category links of an aStore, breadth-first.
 *
 * The store landing page is read first ('#searchbrowse a'); then, for every
 * node found so far, that node's page is read ('#searchbrowse .indent a') and
 * any node not seen before is appended with the current node as its parent.
 * Each node id is recorded at most once, at either level, so no page is
 * fetched twice and no category is returned twice.
 *
 * If anything throws, the error is reported through env.error and the
 * categories collected up to that point are returned.
 *
 * @param astore Store id; '-20' is appended to it to form the store URL.
 * @param env    Host object; supplies newArrayList, newHashMap, newURL,
 *               newJsoup, info and error.
 * @returns The list created by env.newArrayList, holding one map per category
 *          with keys 'title', 'node' and 'parent' ('' for landing-page links).
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  var storeUrl = 'http://astore.amazon.com/' + astore + '-20';
  var nodelist; // node ids in discovery order; doubles as the crawl queue

  // Returns the value of the last 'node=' parameter in url, or null if absent.
  function nodeIdOf(url) {
    var pos = url.lastIndexOf('node=');
    if (pos < 0) {
      return null;
    }
    var node = url.substring(pos + 5); // 5 === 'node='.length
    var end = node.indexOf('&');
    if (end >= 0) {
      node = node.substring(0, end);
    }
    return node;
  }

  // Fetches pageUrl and records every not-yet-seen category link matching
  // selector, attaching the given parent node id.
  function collect(pageUrl, selector, parent) {
    var target = env.newURL(pageUrl);
    // NOTE(review): 60000 is presumably a timeout in milliseconds — confirm.
    var doc = env.newJsoup().parse(target, 60000);
    var links = doc.select(selector);
    for (var i = 0; i < links.size(); i++) {
      var link = links.get(i);
      var node = nodeIdOf(link.attr('href'));
      if (node === null || nodelist.indexOf(node) >= 0) {
        continue;
      }
      var title = link.text();
      var item = env.newHashMap();
      item.put('title', title);
      item.put('node', node);
      item.put('parent', parent);
      tag.add(item);
      nodelist.add(node);
      env.info(node + ' : ' + title);
    }
  }

  try {
    nodelist = env.newArrayList();
    collect(storeUrl, '#searchbrowse a', '');
    // nodelist grows while it is walked, so deeper levels are visited too.
    for (var no = 0; no < nodelist.size(); no++) {
      var current = nodelist.get(no);
      collect(storeUrl + '?node=' + current, '#searchbrowse .indent a', current);
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
/**
 * Entry point: collects the categories of one aStore and stores each of them.
 *
 * The store id is fixed to 'paesia'. Any exception raised while crawling or
 * saving is reported through env.error and not rethrown.
 *
 * @param env  Host object supplying the factories and logging used below.
 * @param args Not read by this function.
 */
function main(env, args) {
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      // saveCategory skips nodes that are already stored.
      saveCategory(cat.get('title'), cat.get('node'), cat.get('parent'), env);
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Stores one category as a 'Category_Amazon' entity, unless
 * findCategoryByNode reports that the node is already stored.
 *
 * @param title  Category title, written to the 'title' field.
 * @param node   Category node id, written to the 'node' field.
 * @param parent Parent node id, written to the 'parent' field.
 * @param env    Host object; supplies newEntity() and uniqid().
 */
function saveCategory(title, node, parent, env) {
  if (findCategoryByNode(node, env)) {
    return;
  }
  var entity = env.newEntity();
  // NOTE(review): the schema string looks like type|name pairs with
  // 's' = string — confirm against the entity API.
  entity.setSchema('s|node|s|title|s|parent');
  entity.setKind('Category_Amazon');
  entity.setId(env.uniqid());
  entity.setString('node', node);
  entity.setString('title', title);
  entity.setString('parent', parent);
  entity.save();
}

/**
 * Reports whether a 'Category_Amazon' entity with the given node id exists.
 *
 * @param node Node id matched against the 'node' field with a term query.
 * @param env  Host object; supplies newEntity().
 * @returns true when the count for the query is greater than zero.
 */
function findCategoryByNode(node, env) {
  var entity = env.newEntity();
  var term = entity.newTerm('node', node);
  var query = entity.newTermQuery(term);
  // NOTE(review): the trailing 1 is presumably a result limit — confirm.
  return entity.count('Category_Amazon', query, 1) > 0;
}

/**
 * Crawls the category links of an aStore, breadth-first.
 *
 * The store landing page is read first ('#searchbrowse a'); then, for every
 * node found so far, that node's page is read ('#searchbrowse .indent a') and
 * any node not seen before is appended with the current node as its parent.
 * Each node id is recorded at most once, at either level, so no page is
 * fetched twice and no category is returned twice.
 *
 * If anything throws, the error is reported through env.error and the
 * categories collected up to that point are returned.
 *
 * @param astore Store id; '-20' is appended to it to form the store URL.
 * @param env    Host object; supplies newArrayList, newHashMap, newURL,
 *               newJsoup, info and error.
 * @returns The list created by env.newArrayList, holding one map per category
 *          with keys 'title', 'node' and 'parent' ('' for landing-page links).
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  var storeUrl = 'http://astore.amazon.com/' + astore + '-20';
  var nodelist; // node ids in discovery order; doubles as the crawl queue

  // Returns the value of the last 'node=' parameter in url, or null if absent.
  function nodeIdOf(url) {
    var pos = url.lastIndexOf('node=');
    if (pos < 0) {
      return null;
    }
    var node = url.substring(pos + 5); // 5 === 'node='.length
    var end = node.indexOf('&');
    if (end >= 0) {
      node = node.substring(0, end);
    }
    return node;
  }

  // Fetches pageUrl and records every not-yet-seen category link matching
  // selector, attaching the given parent node id.
  function collect(pageUrl, selector, parent) {
    var target = env.newURL(pageUrl);
    // NOTE(review): 60000 is presumably a timeout in milliseconds — confirm.
    var doc = env.newJsoup().parse(target, 60000);
    var links = doc.select(selector);
    for (var i = 0; i < links.size(); i++) {
      var link = links.get(i);
      var node = nodeIdOf(link.attr('href'));
      if (node === null || nodelist.indexOf(node) >= 0) {
        continue;
      }
      var title = link.text();
      var item = env.newHashMap();
      item.put('title', title);
      item.put('node', node);
      item.put('parent', parent);
      tag.add(item);
      nodelist.add(node);
      env.info(node + ' : ' + title);
    }
  }

  try {
    nodelist = env.newArrayList();
    collect(storeUrl, '#searchbrowse a', '');
    // nodelist grows while it is walked, so deeper levels are visited too.
    for (var no = 0; no < nodelist.size(); no++) {
      var current = nodelist.get(no);
      collect(storeUrl + '?node=' + current, '#searchbrowse .indent a', current);
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
No comments:
Post a Comment