This spider grabs products from Amazon aStores and saves them to Lucene indexes.
/**
 * Crawls the page range [frompage, topage] of one aStore category in batches
 * of `batch` pages and saves every product returned by grabProduct().
 * Stops at the first batch that yields no products.
 *
 * @param {string} astore   aStore identifier passed through to grabProduct().
 * @param {*}      node     Category node passed through to grabProduct().
 * @param {number} frompage First page to fetch (inclusive).
 * @param {number} topage   Last page to fetch (inclusive).
 * @param {number} batch    Number of pages requested per grabProduct() call.
 * @param {Object} env      Host environment; env.info() is used for progress logging.
 */
function crawlNodePages(astore, node, frompage, topage, batch, env) {
  for (var first = frompage; first <= topage; first += batch) {
    // Clamp the last page of the batch so we never request beyond topage.
    var last = Math.min(first + batch - 1, topage);
    var products = grabProduct(astore, node, first, last, env);
    if (products.size() == 0) break;
    for (var p = 0; p < products.size(); p++) {
      saveProduct(products.get(p), env);
    }
    env.info('Saved: ' + products.size());
  }
}

/**
 * Spider entry point.
 *
 * With an empty `node` setting every category returned by loadCategories()
 * is crawled and then marked via markCategory(); when `cache` is false the
 * existing marks are cleared first so all categories are crawled again.
 * With a non-empty `node` only that single category is crawled.
 *
 * @param {Object} env  Host environment handed to every helper.
 * @param {*}      args Unused.
 */
function main(env, args) {
  var astore = 'paesia';
  var node = '';
  var frompage = 1;
  var topage = 10000;
  var batch = 5;
  var cache = true;

  if (node.length != 0) {
    crawlNodePages(astore, node, frompage, topage, batch, env);
    return;
  }

  if (!cache) {
    clearCategoryMarks(env);
  }
  var nodelist = loadCategories(env);
  // The category index must not be shared with the product loop: the original
  // reused `i` for both, so saving products moved the category cursor.
  for (var c = 0; c < nodelist.size(); c++) {
    node = nodelist.get(c);
    crawlNodePages(astore, node, frompage, topage, batch, env);
    env.info('Saved all from category: ' + node);
    markCategory(node, env);
  }
}
45 | |
/**
 * Resets the mark of every 'Category_Amazon' entity to the empty string and
 * saves it, so that loadCategories() no longer skips any category.
 *
 * @param {Object} env Host environment; must provide newEntity().
 */
function clearCategoryMarks(env) {
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    category.setMark('');
    category.save();
  }
}
54 | |
/**
 * Marks the 'Category_Amazon' entity whose 'node' field matches `node` as
 * 'crawled' and saves it. Does nothing when no such entity is found.
 *
 * @param {*}      node Category node value to look up.
 * @param {Object} env  Host environment; must provide newEntity().
 */
function markCategory(node, env) {
  var finder = env.newEntity();
  var query = finder.newTermQuery(finder.newTerm('node', node));
  var results = finder.search('Category_Amazon', query, 1);
  if (results.size() == 0) return;

  var category = results.get(0);
  category.setMark('crawled');
  category.save();
}
63 | |
/**
 * Collects the 'node' value of every 'Category_Amazon' entity that is not
 * marked 'crawled'.
 *
 * @param {Object} env Host environment; must provide newArrayList() and newEntity().
 * @returns {Object} The list created by env.newArrayList(), filled with node values.
 */
function loadCategories(env) {
  var tag = env.newArrayList();
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    // Loose equality on purpose: the mark may be a host (Java) string.
    if (category.getMark() == 'crawled') continue;
    tag.add(category.getString('node'));
  }
  return tag;
}
74 | |
/**
 * Stores a new 'Link' entity with the given url, title and description,
 * unless findLinkByUrl() reports that the url is already stored.
 *
 * @param {*}      title Link title.
 * @param {*}      url   Link URL; also used for the duplicate check.
 * @param {*}      desc  Link description.
 * @param {Object} env   Host environment; must provide newEntity() and uniqid().
 */
function saveLink(title, url, desc, env) {
  if (findLinkByUrl(url, env)) return;

  var link = env.newEntity();
  // Schema string is handed to the entity verbatim; its format is defined by the host.
  link.setSchema('s|url|a|title|a|desc');
  link.setKind('Link');
  link.setId(env.uniqid());
  link.setString('url', url);
  link.setString('title', title);
  link.setString('desc', desc);
  link.save();
}
87 | |
/**
 * Tells whether a 'Link' entity with the given url already exists.
 *
 * @param {*}      url Value matched against the 'url' field.
 * @param {Object} env Host environment; must provide newEntity().
 * @returns {boolean} true when the count for the url term query is above zero.
 */
function findLinkByUrl(url, env) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  return entity.count('Link', query, 1) > 0;
}
94 | |
/**
 * Saves one grabbed product as a link via saveLink().
 *
 * Products without a title or url are ignored. The product's 'description'
 * (HTML) is reduced to the text of its body before saving; a product without
 * a description is saved with an empty description.
 *
 * @param {Object} pro Product map exposing get(key) for 'title', 'url', 'description'.
 * @param {Object} env Host environment; must provide newJsoup().
 */
function saveProduct(pro, env) {
  var title = pro.get('title');
  var url = pro.get('url');
  if (title == null || url == null) return;
  // Coerce before measuring: host (Java) strings do not necessarily expose a
  // numeric `length` property the way script strings do.
  if (String(title).length == 0 || String(url).length == 0) return;

  // Test for null BEFORE coercing; `null + ''` would yield the text 'null'
  // and that would be stored as the description.
  var rawDesc = pro.get('description');
  var desc = rawDesc == null ? '' : String(rawDesc);
  if (desc.length > 0) {
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }
  saveLink(title, url, desc, env);
}
107 | |
/**
 * Returns the part of `href` that follows the last '/detail/' segment,
 * or null when the link does not point at a product detail page.
 */
function productCodeOf(href) {
  var marker = '/detail/';
  var pos = href.lastIndexOf(marker);
  if (pos < 0) return null;
  return href.substring(pos + marker.length);
}

/**
 * Builds a map (code -> product item) from one listing page: the text row
 * supplies code, title and absolute url; the image row may override the
 * title with the image's alt text and adds 'small-image'.
 */
function collectListedProducts(doc, pageUrl, env) {
  var map = env.newHashMap();

  var textLinks = doc.select('#featuredProducts .textrow a');
  for (var t = 0; t < textLinks.size(); t++) {
    var link = textLinks.get(t);
    var href = link.attr('href');
    var code = productCodeOf(href);
    if (code == null) continue;
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', link.text());
    item.put('url', env.newURL(pageUrl, href) + '');
    map.put(code, item);
  }

  var imageLinks = doc.select('#featuredProducts .imagerow a');
  for (var m = 0; m < imageLinks.size(); m++) {
    var anchor = imageLinks.get(m);
    var imageCode = productCodeOf(anchor.attr('href'));
    if (imageCode == null) continue;
    var listed = map.get(imageCode);
    if (listed == null) continue;
    var img = anchor.select('img').first();
    if (img == null) continue;
    var alt = img.attr('alt');
    // Coerce before measuring so the check works for host and script strings alike.
    if (String(alt).length > 0) {
      listed.put('title', alt);
    }
    listed.put('small-image', img.attr('src'));
  }
  return map;
}

/**
 * Stores the inner HTML of `element` under `key` on `item`, after cutting
 * everything up to and including `heading` (when given and present) and
 * passing the fragment through buildURL(). Nothing is stored when `element`
 * is null or the resulting HTML still contains '<html'.
 */
function putHtmlSection(item, key, element, heading, env) {
  if (element == null) return;
  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }
  var baseUrl = item.get('url') + '';
  var fragment = env.newJsoup().parse(html, baseUrl);
  buildURL(fragment, baseUrl, env);
  html = fragment.select('body').first().html();
  if (html.indexOf('<html') < 0) {
    item.put(key, html);
  }
}

/**
 * Fetches the product's detail page (item 'url') and copies the large image,
 * description, details, editorial reviews, prices and buy link onto `item`
 * for every part that is present on the page.
 */
function loadProductDetails(item, env) {
  // 60000 is forwarded to parse() as-is; presumably a timeout in milliseconds.
  var doc = env.newJsoup().parse(env.newURL(item.get('url')), 60000);

  var image = doc.select('#detailImage img').first();
  if (image != null) {
    item.put('large-image', image.attr('src'));
  }

  putHtmlSection(item, 'description', doc.select('#productDescription').first(), '<h2>Product Description</h2>', env);
  putHtmlSection(item, 'details', doc.select('#productDetails').first(), '<h2>Product Details</h2>', env);
  putHtmlSection(item, 'editorial-reviews', doc.select('#editorialReviews').first(), null, env);

  var listPrice = doc.select('#detailListPrice').first();
  if (listPrice != null) {
    item.put('list-price', listPrice.text());
  }
  var offerPrice = doc.select('#detailOfferPrice').first();
  if (offerPrice != null) {
    item.put('offer-price', offerPrice.text());
  }
  var buyLink = doc.select('#addToCartForm a').first();
  if (buyLink != null) {
    item.put('buy-url', buyLink.attr('href'));
  }
}

/**
 * Grabs the products listed on pages frompage..topage (inclusive) of one
 * aStore category and enriches each with data from its detail page.
 *
 * Errors are reported through env.error() and do not abort the crawl: a
 * failing detail page leaves that product with its listing data only, a
 * failing listing page is skipped.
 *
 * @param {string} astore   aStore identifier used to build the listing URL.
 * @param {*}      node     Category node used in the listing URL.
 * @param {number} frompage First listing page (inclusive).
 * @param {number} topage   Last listing page (inclusive).
 * @param {Object} env      Host environment (newArrayList, newHashMap, newURL,
 *                          newJsoup, getKeys, info, error).
 * @returns {Object} The list created by env.newArrayList(), holding one map per product.
 */
function grabProduct(astore, node, frompage, topage, env) {
  var tag = env.newArrayList();
  for (var no = frompage; no <= topage; no++) {
    try {
      var pageUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var listing = env.newJsoup().parse(pageUrl, 60000);
      var map = collectListedProducts(listing, pageUrl, env);

      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          var item = map.get(keys.get(i));
          loadProductDetails(item, env);
          env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
        } catch (e) {
          env.error(e);
        }
      }

      for (var k = 0; k < keys.size(); k++) {
        tag.add(map.get(keys.get(k)));
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}
226 | |
/**
 * Rewrites `attrName` of every element in `elements` to the URL obtained by
 * resolving its current value against `base` through env.newURL().
 */
function absolutizeAttr(elements, attrName, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var resolved = env.newURL(base, element.attr(attrName));
    element.attr(attrName, resolved + '');
  }
}

/**
 * Makes the links and image sources of a parsed fragment absolute: every
 * `a` gets its `href`, and every `img` its `src`, resolved against baseUrl.
 *
 * @param {Object} doc     Parsed document exposing select(selector).
 * @param {*}      baseUrl Base URL the attribute values are resolved against.
 * @param {Object} env     Host environment; must provide newURL().
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttr(doc.select('a'), 'href', base, env);
  // The original assigned the img selection to a corrupted identifier, so
  // this second pass ran over the anchors again and images were never fixed.
  absolutizeAttr(doc.select('img'), 'src', base, env);
}
/**
 * Crawls the page range [frompage, topage] of one aStore category in batches
 * of `batch` pages and saves every product returned by grabProduct().
 * Stops at the first batch that yields no products.
 */
function crawlNodePages(astore, node, frompage, topage, batch, env) {
  for (var first = frompage; first <= topage; first += batch) {
    // Clamp the last page of the batch so we never request beyond topage.
    var last = Math.min(first + batch - 1, topage);
    var products = grabProduct(astore, node, first, last, env);
    if (products.size() == 0) break;
    for (var p = 0; p < products.size(); p++) {
      saveProduct(products.get(p), env);
    }
    env.info('Saved: ' + products.size());
  }
}

/**
 * Spider entry point.
 *
 * With an empty `node` setting every category returned by loadCategories()
 * is crawled and then marked via markCategory(); when `cache` is false the
 * existing marks are cleared first. With a non-empty `node` only that single
 * category is crawled.
 *
 * @param {Object} env  Host environment handed to every helper.
 * @param {*}      args Unused.
 */
function main(env, args) {
  var astore = 'paesia';
  var node = '';
  var frompage = 1;
  var topage = 10000;
  var batch = 5;
  var cache = true;

  if (node.length != 0) {
    crawlNodePages(astore, node, frompage, topage, batch, env);
    return;
  }

  if (!cache) {
    clearCategoryMarks(env);
  }
  var nodelist = loadCategories(env);
  // The category index must not be shared with the product loop: the original
  // reused `i` for both, so saving products moved the category cursor.
  for (var c = 0; c < nodelist.size(); c++) {
    node = nodelist.get(c);
    crawlNodePages(astore, node, frompage, topage, batch, env);
    env.info('Saved all from category: ' + node);
    markCategory(node, env);
  }
}

/**
 * Resets the mark of every 'Category_Amazon' entity to the empty string and
 * saves it, so that loadCategories() no longer skips any category.
 *
 * @param {Object} env Host environment; must provide newEntity().
 */
function clearCategoryMarks(env) {
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    category.setMark('');
    category.save();
  }
}

/**
 * Marks the 'Category_Amazon' entity whose 'node' field matches `node` as
 * 'crawled' and saves it. Does nothing when no such entity is found.
 *
 * @param {*}      node Category node value to look up.
 * @param {Object} env  Host environment; must provide newEntity().
 */
function markCategory(node, env) {
  var finder = env.newEntity();
  var query = finder.newTermQuery(finder.newTerm('node', node));
  var results = finder.search('Category_Amazon', query, 1);
  if (results.size() == 0) return;

  var category = results.get(0);
  category.setMark('crawled');
  category.save();
}

/**
 * Collects the 'node' value of every 'Category_Amazon' entity that is not
 * marked 'crawled'.
 *
 * @param {Object} env Host environment; must provide newArrayList() and newEntity().
 * @returns {Object} The list created by env.newArrayList(), filled with node values.
 */
function loadCategories(env) {
  var tag = env.newArrayList();
  var entity = env.newEntity();
  var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var i = 0; i < results.size(); i++) {
    var category = results.get(i);
    // Loose equality on purpose: the mark may be a host (Java) string.
    if (category.getMark() == 'crawled') continue;
    tag.add(category.getString('node'));
  }
  return tag;
}

/**
 * Stores a new 'Link' entity with the given url, title and description,
 * unless findLinkByUrl() reports that the url is already stored.
 *
 * @param {*}      title Link title.
 * @param {*}      url   Link URL; also used for the duplicate check.
 * @param {*}      desc  Link description.
 * @param {Object} env   Host environment; must provide newEntity() and uniqid().
 */
function saveLink(title, url, desc, env) {
  if (findLinkByUrl(url, env)) return;

  var link = env.newEntity();
  // Schema string is handed to the entity verbatim; its format is defined by the host.
  link.setSchema('s|url|a|title|a|desc');
  link.setKind('Link');
  link.setId(env.uniqid());
  link.setString('url', url);
  link.setString('title', title);
  link.setString('desc', desc);
  link.save();
}

/**
 * Tells whether a 'Link' entity with the given url already exists.
 *
 * @param {*}      url Value matched against the 'url' field.
 * @param {Object} env Host environment; must provide newEntity().
 * @returns {boolean} true when the count for the url term query is above zero.
 */
function findLinkByUrl(url, env) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  return entity.count('Link', query, 1) > 0;
}

/**
 * Saves one grabbed product as a link via saveLink().
 *
 * Products without a title or url are ignored. The product's 'description'
 * (HTML) is reduced to the text of its body before saving; a product without
 * a description is saved with an empty description.
 *
 * @param {Object} pro Product map exposing get(key) for 'title', 'url', 'description'.
 * @param {Object} env Host environment; must provide newJsoup().
 */
function saveProduct(pro, env) {
  var title = pro.get('title');
  var url = pro.get('url');
  if (title == null || url == null) return;
  // Coerce before measuring: host (Java) strings do not necessarily expose a
  // numeric `length` property the way script strings do.
  if (String(title).length == 0 || String(url).length == 0) return;

  // Test for null BEFORE coercing; `null + ''` would yield the text 'null'
  // and that would be stored as the description.
  var rawDesc = pro.get('description');
  var desc = rawDesc == null ? '' : String(rawDesc);
  if (desc.length > 0) {
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }
  saveLink(title, url, desc, env);
}

/**
 * Returns the part of `href` that follows the last '/detail/' segment,
 * or null when the link does not point at a product detail page.
 */
function productCodeOf(href) {
  var marker = '/detail/';
  var pos = href.lastIndexOf(marker);
  if (pos < 0) return null;
  return href.substring(pos + marker.length);
}

/**
 * Builds a map (code -> product item) from one listing page: the text row
 * supplies code, title and absolute url; the image row may override the
 * title with the image's alt text and adds 'small-image'.
 */
function collectListedProducts(doc, pageUrl, env) {
  var map = env.newHashMap();

  var textLinks = doc.select('#featuredProducts .textrow a');
  for (var t = 0; t < textLinks.size(); t++) {
    var link = textLinks.get(t);
    var href = link.attr('href');
    var code = productCodeOf(href);
    if (code == null) continue;
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', link.text());
    item.put('url', env.newURL(pageUrl, href) + '');
    map.put(code, item);
  }

  var imageLinks = doc.select('#featuredProducts .imagerow a');
  for (var m = 0; m < imageLinks.size(); m++) {
    var anchor = imageLinks.get(m);
    var imageCode = productCodeOf(anchor.attr('href'));
    if (imageCode == null) continue;
    var listed = map.get(imageCode);
    if (listed == null) continue;
    var img = anchor.select('img').first();
    if (img == null) continue;
    var alt = img.attr('alt');
    // Coerce before measuring so the check works for host and script strings alike.
    if (String(alt).length > 0) {
      listed.put('title', alt);
    }
    listed.put('small-image', img.attr('src'));
  }
  return map;
}

/**
 * Stores the inner HTML of `element` under `key` on `item`, after cutting
 * everything up to and including `heading` (when given and present) and
 * passing the fragment through buildURL(). Nothing is stored when `element`
 * is null or the resulting HTML still contains '<html'.
 */
function putHtmlSection(item, key, element, heading, env) {
  if (element == null) return;
  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }
  var baseUrl = item.get('url') + '';
  var fragment = env.newJsoup().parse(html, baseUrl);
  buildURL(fragment, baseUrl, env);
  html = fragment.select('body').first().html();
  if (html.indexOf('<html') < 0) {
    item.put(key, html);
  }
}

/**
 * Fetches the product's detail page (item 'url') and copies the large image,
 * description, details, editorial reviews, prices and buy link onto `item`
 * for every part that is present on the page.
 */
function loadProductDetails(item, env) {
  // 60000 is forwarded to parse() as-is; presumably a timeout in milliseconds.
  var doc = env.newJsoup().parse(env.newURL(item.get('url')), 60000);

  var image = doc.select('#detailImage img').first();
  if (image != null) {
    item.put('large-image', image.attr('src'));
  }

  putHtmlSection(item, 'description', doc.select('#productDescription').first(), '<h2>Product Description</h2>', env);
  putHtmlSection(item, 'details', doc.select('#productDetails').first(), '<h2>Product Details</h2>', env);
  putHtmlSection(item, 'editorial-reviews', doc.select('#editorialReviews').first(), null, env);

  var listPrice = doc.select('#detailListPrice').first();
  if (listPrice != null) {
    item.put('list-price', listPrice.text());
  }
  var offerPrice = doc.select('#detailOfferPrice').first();
  if (offerPrice != null) {
    item.put('offer-price', offerPrice.text());
  }
  var buyLink = doc.select('#addToCartForm a').first();
  if (buyLink != null) {
    item.put('buy-url', buyLink.attr('href'));
  }
}

/**
 * Grabs the products listed on pages frompage..topage (inclusive) of one
 * aStore category and enriches each with data from its detail page.
 *
 * Errors are reported through env.error() and do not abort the crawl: a
 * failing detail page leaves that product with its listing data only, a
 * failing listing page is skipped.
 *
 * @param {string} astore   aStore identifier used to build the listing URL.
 * @param {*}      node     Category node used in the listing URL.
 * @param {number} frompage First listing page (inclusive).
 * @param {number} topage   Last listing page (inclusive).
 * @param {Object} env      Host environment (newArrayList, newHashMap, newURL,
 *                          newJsoup, getKeys, info, error).
 * @returns {Object} The list created by env.newArrayList(), holding one map per product.
 */
function grabProduct(astore, node, frompage, topage, env) {
  var tag = env.newArrayList();
  for (var no = frompage; no <= topage; no++) {
    try {
      var pageUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var listing = env.newJsoup().parse(pageUrl, 60000);
      var map = collectListedProducts(listing, pageUrl, env);

      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          var item = map.get(keys.get(i));
          loadProductDetails(item, env);
          env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
        } catch (e) {
          env.error(e);
        }
      }

      for (var k = 0; k < keys.size(); k++) {
        tag.add(map.get(keys.get(k)));
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

/**
 * Rewrites `attrName` of every element in `elements` to the URL obtained by
 * resolving its current value against `base` through env.newURL().
 */
function absolutizeAttr(elements, attrName, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var resolved = env.newURL(base, element.attr(attrName));
    element.attr(attrName, resolved + '');
  }
}

/**
 * Makes the links and image sources of a parsed fragment absolute: every
 * `a` gets its `href`, and every `img` its `src`, resolved against baseUrl.
 *
 * @param {Object} doc     Parsed document exposing select(selector).
 * @param {*}      baseUrl Base URL the attribute values are resolved against.
 * @param {Object} env     Host environment; must provide newURL().
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttr(doc.select('a'), 'href', base, env);
  // The original assigned the img selection to a corrupted identifier, so
  // this second pass ran over the anchors again and images were never fixed.
  absolutizeAttr(doc.select('img'), 'src', base, env);
}
No comments:
Post a Comment