Tuesday 24 April 2012

Grab categories from Amazon aStores

This spider grabs categories from Amazon aStores and saves them to Lucene indexes.

/**
 * Spider entry point: collects the categories of the hard-coded aStore
 * and hands each one to saveCategory.
 *
 * Any exception raised while grabbing or saving is reported through
 * env.error and not rethrown, so one failure stops the remaining saves
 * without propagating to the caller.
 *
 * @param env  host environment object; only env.error is used directly here,
 *             the rest is forwarded to grabCategory / saveCategory
 * @param args unused
 */
function main(env, args) {
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    // Single pass: every record already carries the three fields needed
    // for saving, so no intermediate node -> category lookup is required.
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      var node = cat.get('node');
      var title = cat.get('title');
      var parent = cat.get('parent');
      saveCategory(title, node, parent, env);
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Stores one category as a 'Category_Amazon' entity unless
 * findCategoryByNode reports that the node is already present.
 *
 * @param title  category title
 * @param node   category node id (also the key used for the existence check)
 * @param parent node id of the parent category ('' is passed for top-level ones)
 * @param env    host environment; supplies newEntity() and uniqid()
 */
function saveCategory(title, node, parent, env) {
  var alreadyStored = findCategoryByNode(node, env);
  if (alreadyStored) {
    return;
  }

  var record = env.newEntity();
  record.setSchema('s|node|s|title|s|parent');
  record.setKind('Category_Amazon');
  record.setId(env.uniqid());

  // Field order follows the schema string above.
  var fields = [
    ['node', node],
    ['title', title],
    ['parent', parent]
  ];
  for (var k = 0; k < fields.length; k++) {
    record.setString(fields[k][0], fields[k][1]);
  }

  record.save();
}

/**
 * Reports whether a 'Category_Amazon' entity with the given node id exists.
 *
 * Builds a term query on the 'node' field and asks the entity API to count
 * matches. The trailing 1 passed to count() is presumably a cap on the
 * number of hits examined -- TODO confirm against the entity API.
 *
 * @param node node id to look up
 * @param env  host environment; supplies newEntity()
 * @returns true when count() reports more than zero matches
 */
function findCategoryByNode(node, env) {
  var lookup = env.newEntity();
  var nodeTerm = lookup.newTerm('node', node);
  var nodeQuery = lookup.newTermQuery(nodeTerm);
  var hits = lookup.count('Category_Amazon', nodeQuery, 1);
  return hits > 0;
}

/**
 * Extracts the category node id from a link target.
 *
 * Takes the text following the last 'node=' in href and cuts it at the
 * next '&' when there is one.
 *
 * @param href link target string
 * @returns the node id, or null when href contains no 'node='
 */
function extractNodeId(href) {
  var start = href.lastIndexOf('node=');
  if (start < 0) {
    return null;
  }
  var id = href.substring(start + 5); // 5 === 'node='.length
  var amp = id.indexOf('&');
  return amp >= 0 ? id.substring(0, amp) : id;
}

/**
 * Appends one category record to tag (and its id to nodelist) for every
 * link whose node id has not been seen yet; links without a node id are
 * ignored. Each addition is logged through env.info.
 *
 * @param links    list-like with size()/get(i); items expose text() and attr('href')
 * @param parent   value stored under 'parent' for every record added
 * @param tag      output list of category records
 * @param nodelist node ids seen so far, in discovery order
 * @param env      host environment; supplies newHashMap() and info()
 */
function collectCategories(links, parent, tag, nodelist, env) {
  for (var i = 0; i < links.size(); i++) {
    var link = links.get(i);
    var title = link.text();
    var node = extractNodeId(link.attr('href'));
    if (node === null) continue;
    // Skip ids already recorded so a repeated link is neither stored
    // nor queued for crawling a second time.
    if (nodelist.indexOf(node) >= 0) continue;
    var item = env.newHashMap();
    item.put('title', title);
    item.put('node', node);
    item.put('parent', parent);
    tag.add(item);
    nodelist.add(node);
    env.info(node + ' : ' + title);
  }
}

/**
 * Crawls an aStore and returns its categories.
 *
 * The store front page ('#searchbrowse a' links) yields the top-level
 * categories with parent ''. Every node discovered is then visited in
 * discovery order and its '#searchbrowse .indent a' links are added as
 * children; nodelist doubles as the work queue, so children found along
 * the way are visited too.
 *
 * Any exception is reported through env.error and ends the crawl; the
 * records collected up to that point are still returned.
 *
 * @param astore store name; '-20' is appended to form the store URL
 * @param env    host environment; supplies newArrayList(), newHashMap(),
 *               newURL(), newJsoup(), info() and error()
 * @returns list of records, each with 'title', 'node' and 'parent' entries
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  try {
    var nodelist = env.newArrayList();
    var storeUrl = 'http://astore.amazon.com/' + astore + '-20';
    // Second argument to parse(); presumably a timeout in milliseconds
    // as in Jsoup.parse(URL, int) -- TODO confirm for the env wrapper.
    var timeout = 60000;

    var alink = env.newURL(storeUrl);
    var doc = env.newJsoup().parse(alink, timeout);
    collectCategories(doc.select('#searchbrowse a'), '', tag, nodelist, env);

    // nodelist grows while it is being walked, so size() is re-read each turn.
    for (var no = 0; no < nodelist.size(); no++) {
      var parent = nodelist.get(no);
      alink = env.newURL(storeUrl + '?node=' + parent);
      doc = env.newJsoup().parse(alink, timeout);
      collectCategories(doc.select('#searchbrowse .indent a'), parent, tag, nodelist, env);
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

No comments:

Post a Comment