Tuesday, 24 April 2012

Grab categories from Amazon aStores

This spider grabs categories from Amazon aStores and saves them to Lucene indexes.

1function main(env, args) {
2 var astore = 'paesia';
3 try {
4 var categories = grabCategory(astore, env);
5 var map = env.newHashMap();
6 for (var i = 0; i < categories.size(); i++) {
7 var cat = categories.get(i);
8 map.put(cat.get('node'), cat);
9 }
10 for (var i = 0; i < categories.size(); i++) {
11 var cat = categories.get(i);
12 var node = cat.get('node');
13 var title = cat.get('title');
14 var parent = cat.get('parent');
15 saveCategory(title, node, parent, env);
16 }
17 } catch (e) {
18 env.error(e);
19 }
20}
21
22function saveCategory(title, node, parent, env) {
23 if (findCategoryByNode(node, env)) return;
24 var schema = 's|node|s|title|s|parent';
25 var entity = env.newEntity();
26 entity.setSchema(schema);
27 entity.setKind('Category_Amazon');
28 entity.setId(env.uniqid());
29 entity.setString('node', node);
30 entity.setString('title', title);
31 entity.setString('parent', parent);
32 entity.save();
33}
34
35function findCategoryByNode(node, env) {
36 var entity = env.newEntity();
37 var query = entity.newTermQuery(entity.newTerm('node', node));
38 var size = entity.count('Category_Amazon', query, 1);
39 return (size > 0);
40}
41
42function grabCategory(astore, env) {
43 var tag = env.newArrayList();
44 try {
45 var nodelist = env.newArrayList();
46 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20');
47 var doc = env.newJsoup().parse(alink, 60000);
48 var elements = doc.select('#searchbrowse a');
49 for (var i = 0; i < elements.size(); i++) {
50 var element = elements.get(i);
51 var title = element.text();
52 var url = element.attr('href');
53 var pos = url.lastIndexOf('node=');
54 if (pos < 0) continue;
55 var node = url.substring(pos + 5);
56 pos = node.indexOf('&');
57 if (pos >= 0) {
58 node = node.substring(0, pos);
59 }
60 var item = env.newHashMap();
61 item.put('title', title);
62 item.put('node', node);
63 item.put('parent', '');
64 tag.add(item);
65 nodelist.add(node);
66 env.info(node + ' : ' + title);
67 }
68 var no = 0;
69 while (no < nodelist.size()) {
70 alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + nodelist.get(no));
71 doc = env.newJsoup().parse(alink, 60000);
72 elements = doc.select('#searchbrowse .indent a');
73 for (var i = 0; i < elements.size(); i++) {
74 var element = elements.get(i);
75 var title = element.text();
76 var url = element.attr('href');
77 var pos = url.lastIndexOf('node=');
78 if (pos < 0) continue;
79 var node = url.substring(pos + 5);
80 pos = node.indexOf('&');
81 if (pos >= 0) {
82 node = node.substring(0, pos);
83 }
84 if (nodelist.indexOf(node) >= 0) continue;
85 var item = env.newHashMap();
86 item.put('title', title);
87 item.put('node', node);
88 item.put('parent', nodelist.get(no));
89 tag.add(item);
90 nodelist.add(node);
91 env.info(node + ' : ' + title);
92 }
93 no++;
94 }
95 } catch (e) {
96 env.error(e);
97 }
98 return tag;
99}
/**
 * Script entry point: crawls the category tree of one aStore and stores
 * every category that was found.
 *
 * Any exception raised while crawling or saving is reported through
 * `env.error` instead of propagating to the host.
 *
 * @param {Object} env  Host-provided helper object (logging, entity and
 *                      collection factories).
 * @param {*} args      Not used by this script.
 */
function main(env, args) {
  // Store identifier that grabCategory turns into the aStore URL.
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    // `categories` is a list with size()/get(i) whose items expose get(key).
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      var node = cat.get('node');
      var title = cat.get('title');
      var parent = cat.get('parent');
      saveCategory(title, node, parent, env);
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Stores one category as a 'Category_Amazon' entity, unless
 * findCategoryByNode reports that the node is already stored.
 *
 * @param {string} title   Category title.
 * @param {string} node    Category node id; used as the de-duplication key.
 * @param {string} parent  Node id of the parent category ('' for top level).
 * @param {Object} env     Host-provided helper object (entity factory, ids).
 */
function saveCategory(title, node, parent, env) {
  var alreadyStored = findCategoryByNode(node, env);
  if (alreadyStored) {
    return;
  }

  var record = env.newEntity();
  // Presumably declares three string fields; the schema format is defined
  // by the host, not by this script.
  record.setSchema('s|node|s|title|s|parent');
  record.setKind('Category_Amazon');
  record.setId(env.uniqid());

  var fields = [
    ['node', node],
    ['title', title],
    ['parent', parent]
  ];
  for (var f = 0; f < fields.length; f++) {
    record.setString(fields[f][0], fields[f][1]);
  }
  record.save();
}

/**
 * Tells whether a 'Category_Amazon' entity with the given node id exists.
 *
 * @param {string} node  Category node id to look up.
 * @param {Object} env   Host-provided helper object (entity factory).
 * @returns {boolean} true when the count query reports at least one match.
 */
function findCategoryByNode(node, env) {
  var searcher = env.newEntity();
  var nodeTerm = searcher.newTerm('node', node);
  // The trailing 1 is presumably a cap on the number of hits counted;
  // only "zero or more than zero" matters here.
  var hits = searcher.count('Category_Amazon', searcher.newTermQuery(nodeTerm), 1);
  return hits > 0;
}

/**
 * Extracts the value of the last `node=` parameter from a link target.
 *
 * @param {string} href  Link target as read from the anchor's href attribute.
 * @returns {?string} The node id (text after `node=` up to the next `&`),
 *                    or null when the href has no `node=` parameter.
 */
function extractNodeId(href) {
  var start = href.lastIndexOf('node=');
  if (start < 0) return null;
  var node = href.substring(start + 5); // 5 === 'node='.length
  var amp = node.indexOf('&');
  return amp >= 0 ? node.substring(0, amp) : node;
}

/**
 * Records every not-yet-seen category linked from `elements`.
 *
 * For each anchor with a `node=` parameter whose node id is not already in
 * `nodelist`, appends a {title, node, parent} map to `tag`, appends the node
 * id to `nodelist` (which queues it for crawling) and logs it via `env.info`.
 *
 * @param {Object} elements  Selected anchors, exposing size()/get(i).
 * @param {string} parent    Node id recorded as parent ('' for top level).
 * @param {Object} tag       Output list of category maps.
 * @param {Object} nodelist  List of node ids seen so far.
 * @param {Object} env       Host-provided helper object.
 */
function collectCategoryLinks(elements, parent, tag, nodelist, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var node = extractNodeId(element.attr('href'));
    // Skip links without a node id and nodes that were already recorded, so
    // each category is stored and fetched once.
    if (node === null || nodelist.indexOf(node) >= 0) continue;
    var title = element.text();
    var item = env.newHashMap();
    item.put('title', title);
    item.put('node', node);
    item.put('parent', parent);
    tag.add(item);
    nodelist.add(node);
    env.info(node + ' : ' + title);
  }
}

/**
 * Crawls the category tree of an aStore.
 *
 * Reads the category links on the store's landing page, then visits each
 * discovered node's page in discovery order and collects the sub-category
 * links found there; newly found nodes are appended to the work list, so the
 * loop ends once no page yields an unseen node.
 *
 * An exception (for example a failed page fetch) stops the crawl; it is
 * reported through `env.error` and the categories gathered so far are
 * returned.
 *
 * @param {string} astore  Store identifier; '-20' is appended to form the URL.
 * @param {Object} env     Host-provided helper object (collections, URL and
 *                         Jsoup factories, logging).
 * @returns {Object} List of maps with keys 'title', 'node' and 'parent'
 *                   ('' for categories found on the landing page).
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  try {
    // Doubles as the "seen" set and as the crawl queue.
    var nodelist = env.newArrayList();
    var storeUrl = 'http://astore.amazon.com/' + astore + '-20';
    var alink = env.newURL(storeUrl);
    var doc = env.newJsoup().parse(alink, 60000);
    collectCategoryLinks(doc.select('#searchbrowse a'), '', tag, nodelist, env);

    // nodelist grows while it is being walked, so re-read size() each pass.
    for (var no = 0; no < nodelist.size(); no++) {
      var parent = nodelist.get(no);
      alink = env.newURL(storeUrl + '?node=' + parent);
      doc = env.newJsoup().parse(alink, 60000);
      collectCategoryLinks(doc.select('#searchbrowse .indent a'), parent, tag, nodelist, env);
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

No comments:

Post a Comment