Tuesday, 24 April 2012

Grab products from Amazon aStores

This spider grabs products from Amazon aStores and saves them to Lucene indexes.

1function main(env, args) {
2 var astore = 'paesia';
3 var node = '';
4 var frompage = 1;
5 var topage = 10000;
6 var batch = 5;
7 var cache = true;
8 if (node.length == 0) {
9 if (!cache) {
10 clearCategoryMarks(env);
11 }
12 var nodelist = loadCategories(env);
13 for (var i = 0; i < nodelist.size(); i++) {
14 node = nodelist.get(i);
15 for (var no = frompage; no <= topage; no += batch) {
16 var min = no;
17 var max = no + batch - 1;
18 if (max > topage) max = topage;
19 var products = grabProduct(astore, node, min, max, env);
20 if (products.size() == 0) break;
21 for (var i = 0; i < products.size(); i++) {
22 var pro = products.get(i);
23 saveProduct(pro, env);
24 }
25 env.info('Saved: ' + products.size());
26 }
27 env.info('Saved all from category: ' + node);
28 markCategory(node, env);
29 }
30 } else {
31 for (var no = frompage; no <= topage; no += batch) {
32 var min = no;
33 var max = no + batch - 1;
34 if (max > topage) max = topage;
35 var products = grabProduct(astore, node, min, max, env);
36 if (products.size() == 0) break;
37 for (var i = 0; i < products.size(); i++) {
38 var pro = products.get(i);
39 saveProduct(pro, env);
40 }
41 env.info('Saved: ' + products.size());
42 }
43 }
44}
45
46function clearCategoryMarks(env) {
47 var entity = env.newEntity();
48 var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
49 for (var i = 0; i < results.size(); i++) {
50 results.get(i).setMark('');
51 results.get(i).save();
52 }
53}
54
55function markCategory(node, env) {
56 var cat = env.newEntity();
57 var results = cat.search('Category_Amazon', cat.newTermQuery(cat.newTerm('node', node)), 1);
58 if (results.size() == 0) return;
59 cat = results.get(0);
60 cat.setMark('crawled');
61 cat.save();
62}
63
64function loadCategories(env) {
65 var tag = env.newArrayList();
66 var entity = env.newEntity();
67 var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
68 for (var i = 0; i < results.size(); i++) {
69 if (results.get(i).getMark() == 'crawled') continue;
70 tag.add(results.get(i).getString('node'));
71 }
72 return tag;
73}
74
75function saveLink(title, url, desc, env) {
76 if (findLinkByUrl(url, env)) return;
77 var schema = 's|url|a|title|a|desc';
78 var entity = env.newEntity();
79 entity.setSchema(schema);
80 entity.setKind('Link');
81 entity.setId(env.uniqid());
82 entity.setString('url', url);
83 entity.setString('title', title);
84 entity.setString('desc', desc);
85 entity.save();
86}
87
88function findLinkByUrl(url, env) {
89 var entity = env.newEntity();
90 var query = entity.newTermQuery(entity.newTerm('url', url));
91 var size = entity.count('Link', query, 1);
92 return (size > 0);
93}
94
95function saveProduct(pro, env) {
96 var title = pro.get('title');
97 var url = pro.get('url');
98 if (title == null || title.length == 0 || url == null || url.length == 0) return;
99 var desc = pro.get('description') + '';
100 if (desc == null) desc = '';
101 if (desc.length > 0) {
102 var doc = env.newJsoup().parse(desc);
103 desc = doc.select('body').first().text();
104 }
105 saveLink(title, url, desc, env);
106}
107
108function grabProduct(astore, node, frompage, topage, env) {
109 var tag = env.newArrayList();
110 for (var no = frompage; no <= topage; no++) {
111 try {
112 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
113 var doc = env.newJsoup().parse(alink, 60000);
114 var elements = doc.select('#featuredProducts .textrow a');
115 var map = env.newHashMap();
116 for (var i = 0; i < elements.size(); i++) {
117 var element = elements.get(i);
118 var title = element.text();
119 var url = element.attr('href');
120 var pos = url.lastIndexOf('/detail/');
121 if (pos < 0) continue;
122 var code = url.substring(pos + 8);
123 var url = env.newURL(alink, url) + '';
124 var item = env.newHashMap();
125 item.put('code', code);
126 item.put('title', title);
127 item.put('url', url);
128 map.put(code, item);
129 }
130 elements = doc.select('#featuredProducts .imagerow a');
131 for (var i = 0; i < elements.size(); i++) {
132 var element = elements.get(i);
133 var url = element.attr('href');
134 var pos = url.lastIndexOf('/detail/');
135 if (pos < 0) continue;
136 var code = url.substring(pos + 8);
137 var item = map.get(code);
138 if (item == null) continue;
139 var child = element.select('img').first();
140 if (child == null) continue;
141 var title = child.attr('alt');
142 var smimg = child.attr('src');
143 if (title.length() > 0) {
144 item.put('title', title);
145 }
146 item.put('small-image', smimg);
147 }
148
149 var keys = env.getKeys(map);
150 for (var i = 0; i < keys.size(); i++) {
151 try {
152 var item = map.get(keys.get(i));
153 alink = env.newURL(item.get('url'));
154 doc = env.newJsoup().parse(alink, 60000);
155 var element = doc.select('#detailImage img').first();
156 if (element != null) {
157 item.put('large-image', element.attr('src'));
158 }
159 element = doc.select('#productDescription').first();
160 if (element != null) {
161 var desc = element.html();
162 var pattern = '<h2>Product Description</h2>';
163 var pos = desc.indexOf(pattern);
164 if (pos >= 0) {
165 desc = desc.substring(pos + pattern.length);
166 }
167 var bdoc = env.newJsoup().parse(desc, item.get('url'));
168 buildURL(bdoc, item.get('url'), env);
169 desc = bdoc.select('body').first().html();
170 if (desc.indexOf('<html') < 0) {
171 item.put('description', desc);
172 }
173 }
174 element = doc.select('#productDetails').first();
175 if (element != null) {
176 var desc = element.html();
177 var pattern = '<h2>Product Details</h2>';
178 var pos = desc.indexOf(pattern);
179 if (pos >= 0) {
180 desc = desc.substring(pos + pattern.length);
181 }
182 var bdoc = env.newJsoup().parse(desc, item.get('url'));
183 buildURL(bdoc, item.get('url'), env);
184 desc = bdoc.select('body').first().html();
185 if (desc.indexOf('<html') < 0) {
186 item.put('details', desc);
187 }
188 }
189 element = doc.select('#editorialReviews').first();
190 if (element != null) {
191 var desc = element.html();
192 var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
193 buildURL(bdoc, item.get('url'), env);
194 desc = bdoc.select('body').first().html();
195 if (desc.indexOf('<html') < 0) {
196 item.put('editorial-reviews', desc);
197 }
198 }
199 element = doc.select('#detailListPrice').first();
200 if (element != null) {
201 item.put('list-price', element.text());
202 }
203 element = doc.select('#detailOfferPrice').first();
204 if (element != null) {
205 item.put('offer-price', element.text());
206 }
207 element = doc.select('#addToCartForm a').first();
208 if (element != null) {
209 item.put('buy-url', element.attr('href'));
210 }
211 env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
212 } catch (e) {
213 env.error(e);
214 }
215 }
216
217 for (var i = 0; i < keys.size(); i++) {
218 tag.add(map.get(keys.get(i)));
219 }
220 } catch (e) {
221 env.error(e);
222 }
223 }
224 return tag;
225}
226
227function buildURL(doc, baseUrl, env) {
228 baseUrl = env.newURL(baseUrl);
229 var elements = doc.select('a');
230 for (var i = 0; i < elements.size(); i++) {
231 var element = elements.get(i);
232 var url = env.newURL(baseUrl, element.attr('href'));
233 element.attr('href', url + '');
234 }
235 elements = doc.select('img');
236 for (var i = 0; i < elements.size(); i++) {
237 var element = elements.get(i);
238 var url = env.newURL(baseUrl, element.attr('src'));
239 element.attr('src', url + '');
240 }
241}
/**
 * Script entry point: crawls an Amazon aStore and stores every product found.
 *
 * The crawl is configured by the local variables below:
 *   - astore:   store identifier passed to grabProduct
 *   - node:     category node to crawl; when empty, every category returned by
 *               loadCategories (i.e. not yet marked as crawled) is crawled
 *   - frompage / topage: inclusive page range to request
 *   - batch:    number of pages handed to grabProduct per call
 *   - cache:    when false, all category marks are cleared first so that every
 *               category is crawled again
 *
 * @param env  host environment object (logging, entity and collection factories)
 * @param args script arguments (currently unused)
 */
function main(env, args) {
  var astore = 'paesia';
  var node = '';
  var frompage = 1;
  var topage = 10000;
  var batch = 5;
  var cache = true;

  // A specific node was requested: crawl just that one and leave marks alone.
  if (node.length > 0) {
    crawlNode(astore, node, frompage, topage, batch, env);
    return;
  }

  if (!cache) {
    clearCategoryMarks(env);
  }
  var nodelist = loadCategories(env);
  // The category index must not be shared with any product loop: `var` is
  // function-scoped, and sharing it made the crawl skip categories.
  for (var c = 0; c < nodelist.size(); c++) {
    var current = nodelist.get(c);
    crawlNode(astore, current, frompage, topage, batch, env);
    env.info('Saved all from category: ' + current);
    markCategory(current, env);
  }
}

/**
 * Crawls one category node in batches of `batch` pages and saves each product.
 * Stops at the first batch that yields no products.
 */
function crawlNode(astore, node, frompage, topage, batch, env) {
  for (var no = frompage; no <= topage; no += batch) {
    // Last page of this batch, clamped to the configured upper bound.
    var max = Math.min(no + batch - 1, topage);
    var products = grabProduct(astore, node, no, max, env);
    if (products.size() == 0) break;
    for (var p = 0; p < products.size(); p++) {
      saveProduct(products.get(p), env);
    }
    env.info('Saved: ' + products.size());
  }
}

/**
 * Resets the mark of every 'Category_Amazon' entity to the empty string and
 * saves each one.
 *
 * @param env host environment providing newEntity()
 */
function clearCategoryMarks(env) {
  var finder = env.newEntity();
  var everything = finder.newMatchAllDocsQuery();
  var categories = finder.search('Category_Amazon', everything, java.lang.Integer.MAX_VALUE);
  var index = 0;
  while (index < categories.size()) {
    var category = categories.get(index);
    category.setMark('');
    category.save();
    index += 1;
  }
}

/**
 * Marks the 'Category_Amazon' entity whose 'node' term equals `node` as
 * 'crawled' and saves it. Does nothing when no such entity is found.
 *
 * @param node category node identifier to look up
 * @param env  host environment providing newEntity()
 */
function markCategory(node, env) {
  var finder = env.newEntity();
  var nodeTerm = finder.newTerm('node', node);
  var byNode = finder.newTermQuery(nodeTerm);
  var hits = finder.search('Category_Amazon', byNode, 1);
  if (hits.size() != 0) {
    var category = hits.get(0);
    category.setMark('crawled');
    category.save();
  }
}

/**
 * Collects the 'node' value of every 'Category_Amazon' entity whose mark is
 * not 'crawled'.
 *
 * @param env host environment providing newArrayList() and newEntity()
 * @returns the list created by env.newArrayList(), filled with node values
 */
function loadCategories(env) {
  var pending = env.newArrayList();
  var finder = env.newEntity();
  var everything = finder.newMatchAllDocsQuery();
  var categories = finder.search('Category_Amazon', everything, java.lang.Integer.MAX_VALUE);
  var index = 0;
  while (index < categories.size()) {
    var category = categories.get(index);
    if (category.getMark() != 'crawled') {
      pending.add(category.getString('node'));
    }
    index += 1;
  }
  return pending;
}

/**
 * Stores a new 'Link' entity with the given url, title and desc, unless
 * findLinkByUrl reports that the url is already stored.
 *
 * @param title link title
 * @param url   link URL, also used for the duplicate check
 * @param desc  link description
 * @param env   host environment providing newEntity() and uniqid()
 */
function saveLink(title, url, desc, env) {
  var alreadyStored = findLinkByUrl(url, env);
  if (!alreadyStored) {
    var link = env.newEntity();
    link.setSchema('s|url|a|title|a|desc');
    link.setKind('Link');
    link.setId(env.uniqid());
    // Field values are written in the same order as before: url, title, desc.
    var fields = [['url', url], ['title', title], ['desc', desc]];
    for (var f = 0; f < fields.length; f++) {
      link.setString(fields[f][0], fields[f][1]);
    }
    link.save();
  }
}

/**
 * Reports whether at least one 'Link' entity matches the given url term.
 *
 * @param url URL to look up
 * @param env host environment providing newEntity()
 * @returns true when the count of matching 'Link' entities is above zero
 */
function findLinkByUrl(url, env) {
  var finder = env.newEntity();
  var urlTerm = finder.newTerm('url', url);
  var matches = finder.count('Link', finder.newTermQuery(urlTerm), 1);
  return matches > 0;
}

/**
 * Saves one grabbed product as a link: title, url and a plain-text version of
 * its HTML description.
 *
 * Products without a title or url are skipped. A missing description is saved
 * as an empty string (it used to be coerced to the literal text 'null').
 *
 * @param pro map-like product record; read through pro.get(key) for the keys
 *            'title', 'url' and 'description'
 * @param env host environment providing newJsoup()
 */
function saveProduct(pro, env) {
  var title = asJsString(pro.get('title'));
  var url = asJsString(pro.get('url'));
  if (title.length == 0 || url.length == 0) return;
  var desc = asJsString(pro.get('description'));
  if (desc.length > 0) {
    // Strip the markup: keep only the text content of the parsed body.
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }
  saveLink(title, url, desc, env);
}

/**
 * Converts a possibly-null value to a JavaScript string ('' for null/undefined).
 * NOTE(review): values read back from a Java map may be Java strings, whose
 * `length` is a method rather than a number, so the emptiness checks above only
 * work reliably on real JS strings — confirm against the host runtime.
 */
function asJsString(value) {
  return value == null ? '' : String(value);
}

/**
 * Grabs the products listed on pages `frompage`..`topage` (inclusive) of one
 * aStore category and enriches each with data from its detail page.
 *
 * For every page the listing at
 * 'http://astore.amazon.com/<astore>-20?node=<node>&page=<no>' is parsed, a map
 * of items keyed by product code is built, and each item's own URL is then
 * fetched to fill in images, descriptions, prices and the buy link.
 *
 * Failures are logged through env.error and do not abort the crawl: an error
 * while processing one detail page skips the rest of that item's enrichment,
 * and an error while processing a listing page means none of that page's items
 * are appended to the result.
 *
 * @param astore   store identifier inserted into the listing URL before '-20'
 * @param node     category node identifier sent as the 'node' query parameter
 * @param frompage first page number to fetch
 * @param topage   last page number to fetch
 * @param env      host environment (URL, jsoup and collection factories, logging)
 * @returns the list created by env.newArrayList(), holding one map per product
 *          with the keys 'code', 'title', 'url' and, when found, 'small-image',
 *          'large-image', 'description', 'details', 'editorial-reviews',
 *          'list-price', 'offer-price' and 'buy-url'
 */
function grabProduct(astore, node, frompage, topage, env) {
  var tag = env.newArrayList();
  for (var no = frompage; no <= topage; no++) {
    try {
      var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      // NOTE(review): 60000 is presumably a timeout in milliseconds — confirm
      // against the env.newJsoup() wrapper.
      var doc = env.newJsoup().parse(alink, 60000);
      // Pass 1: text links give each product's code, title and absolute URL.
      var elements = doc.select('#featuredProducts .textrow a');
      var map = env.newHashMap();
      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var title = element.text();
        var url = element.attr('href');
        var pos = url.lastIndexOf('/detail/');
        if (pos < 0) continue;
        // The product code is everything after '/detail/' (8 characters long).
        var code = url.substring(pos + 8);
        // Resolve the href against the listing URL and turn it into a string.
        var url = env.newURL(alink, url) + '';
        var item = env.newHashMap();
        item.put('code', code);
        item.put('title', title);
        item.put('url', url);
        map.put(code, item);
      }
      // Pass 2: image links add the thumbnail and, when the image has a
      // non-empty alt text, replace the title taken from the text link.
      elements = doc.select('#featuredProducts .imagerow a');
      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var url = element.attr('href');
        var pos = url.lastIndexOf('/detail/');
        if (pos < 0) continue;
        var code = url.substring(pos + 8);
        var item = map.get(code);
        // Only products already found through a text link are enriched.
        if (item == null) continue;
        var child = element.select('img').first();
        if (child == null) continue;
        var title = child.attr('alt');
        var smimg = child.attr('src');
        if (title.length() > 0) {
          item.put('title', title);
        }
        item.put('small-image', smimg);
      }

      // Pass 3: fetch each product's detail page. From here on `alink` and
      // `doc` refer to the detail page, not the listing page.
      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          var item = map.get(keys.get(i));
          alink = env.newURL(item.get('url'));
          doc = env.newJsoup().parse(alink, 60000);
          var element = doc.select('#detailImage img').first();
          if (element != null) {
            item.put('large-image', element.attr('src'));
          }
          element = doc.select('#productDescription').first();
          if (element != null) {
            var desc = element.html();
            // Drop everything up to and including the section heading.
            var pattern = '<h2>Product Description</h2>';
            var pos = desc.indexOf(pattern);
            if (pos >= 0) {
              desc = desc.substring(pos + pattern.length);
            }
            // Re-parse the fragment so buildURL can rewrite its links and
            // images relative to the product URL.
            var bdoc = env.newJsoup().parse(desc, item.get('url'));
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            // The fragment is stored only when it contains no '<html' text.
            if (desc.indexOf('<html') < 0) {
              item.put('description', desc);
            }
          }
          // Same treatment as the description, for the details section.
          element = doc.select('#productDetails').first();
          if (element != null) {
            var desc = element.html();
            var pattern = '<h2>Product Details</h2>';
            var pos = desc.indexOf(pattern);
            if (pos >= 0) {
              desc = desc.substring(pos + pattern.length);
            }
            var bdoc = env.newJsoup().parse(desc, item.get('url'));
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            if (desc.indexOf('<html') < 0) {
              item.put('details', desc);
            }
          }
          // Editorial reviews are kept whole: no heading is stripped here.
          element = doc.select('#editorialReviews').first();
          if (element != null) {
            var desc = element.html();
            var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            if (desc.indexOf('<html') < 0) {
              item.put('editorial-reviews', desc);
            }
          }
          element = doc.select('#detailListPrice').first();
          if (element != null) {
            item.put('list-price', element.text());
          }
          element = doc.select('#detailOfferPrice').first();
          if (element != null) {
            item.put('offer-price', element.text());
          }
          // The buy link is stored exactly as it appears in the href attribute.
          element = doc.select('#addToCartForm a').first();
          if (element != null) {
            item.put('buy-url', element.attr('href'));
          }
          // Progress log: node, page number, 1-based item position, item URL.
          env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
        } catch (e) {
          // Best effort: log and move on to the next product.
          env.error(e);
        }
      }

      // Append this page's items, including any whose detail fetch failed.
      for (var i = 0; i < keys.size(); i++) {
        tag.add(map.get(keys.get(i)));
      }
    } catch (e) {
      // Best effort: log and move on to the next listing page.
      env.error(e);
    }
  }
  return tag;
}

/**
 * Rewrites every link and image in `doc` to an absolute URL, in place.
 *
 * Each <a href> and <img src> value is resolved against `baseUrl` through
 * env.newURL(base, value) and written back as a string.
 *
 * @param doc     parsed document whose 'a' and 'img' elements are updated
 * @param baseUrl URL the relative references are resolved against
 * @param env     host environment providing newURL()
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttr(doc.select('a'), 'href', base, env);
  // Images need their own selection; a garbled variable name previously made
  // this pass run over the anchors again and leave every <img> untouched.
  absolutizeAttr(doc.select('img'), 'src', base, env);
}

/**
 * Replaces attribute `attrName` of each element with its value resolved
 * against `base`.
 */
function absolutizeAttr(elements, attrName, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var resolved = env.newURL(base, element.attr(attrName));
    element.attr(attrName, resolved + '');
  }
}

No comments:

Post a Comment