Tuesday, 24 April 2012

Tutorial of using Bewsia - Micro Search Engine for Desktop

  1. Create basic spider
  2. Extract data from web page

Extract data from web page

To retrieve a web page, do the following:

1function main(env, args) {
2 var link = 'http://astore.amazon.com/paesia-20?node=22';
3 var timeout = 60000; // milisecond
4 var doc = env.newJsoup().parse(env.newURL(link), timeout);
5}
// Minimal spider: downloads a single aStore page and parses it into 'doc'.
//   env  - host object supplying the newJsoup() and newURL() factories
//   args - spider arguments (not used here)
function main(env, args) {
  var timeout = 60000; // milliseconds
  var link = 'http://astore.amazon.com/paesia-20?node=22';
  var jsoup = env.newJsoup();
  var doc = jsoup.parse(env.newURL(link), timeout);
}

The 'doc' variable holds an object of the Document class.

With the 'doc' variable, data can be extracted as follows:

1function main(env, args) {
2 var link = 'http://astore.amazon.com/paesia-20?node=22';
3 var timeout = 60000; // milisecond
4 var doc = env.newJsoup().parse(env.newURL(link), timeout);
5
6 var elements = doc.select('#searchbrowse a');
7 for (var i = 0; i < elements.size(); i++) {
8 var element = elements.get(i);
9 var title = element.text();
10 var url = element.attr('href');
11 env.info('Title: ' + title + '\nUrl: ' + url);
12 }
13}
// Downloads one aStore page and logs the text and href of every link
// matched by the '#searchbrowse a' selector.
//   env  - host object supplying newJsoup(), newURL() and info()
//   args - spider arguments (not used here)
function main(env, args) {
  var timeout = 60000; // milliseconds
  var link = 'http://astore.amazon.com/paesia-20?node=22';
  var jsoup = env.newJsoup();
  var doc = jsoup.parse(env.newURL(link), timeout);

  var elements = doc.select('#searchbrowse a');
  var i = 0;
  while (i < elements.size()) {
    var element = elements.get(i);
    var title = element.text();
    var url = element.attr('href');
    env.info('Title: ' + title + '\nUrl: ' + url);
    i += 1;
  }
}

'elements' variable is object of Elements class.

'element' variable is object of Element class.

Create basic spider

Each spider is a JavaScript file containing one main() function.

1function main(env, args) {
2 var links = args.get('links');
3 links.add('Hello world!');
4}
// Smallest possible spider: appends a greeting to the 'links' list
// that the host passes in through 'args'.
function main(env, args) {
  args.get('links').add('Hello world!');
}

'env' is object of Machine class.

'args' is object of HashMap class.

'args' contains a value under the 'links' key, which is an object of the ArrayList class. The 'links' object is printed to the log after the spider finishes running.

In a spider script, objects that belong to Java classes are accessed the same way as in the Java language. Calling a method whose return value or parameter is an object of an unsupported class fails with an error. This prevents a spider from accessing restricted resources such as the file system. Spiders run in a sandbox, which keeps the system safe.

Grab products from Amazon aStores

This spider grabs products from Amazon aStores and saves them to Lucene indexes.

1function main(env, args) {
2 var astore = 'paesia';
3 var node = '';
4 var frompage = 1;
5 var topage = 10000;
6 var batch = 5;
7 var cache = true;
8 if (node.length == 0) {
9 if (!cache) {
10 clearCategoryMarks(env);
11 }
12 var nodelist = loadCategories(env);
13 for (var i = 0; i < nodelist.size(); i++) {
14 node = nodelist.get(i);
15 for (var no = frompage; no <= topage; no += batch) {
16 var min = no;
17 var max = no + batch - 1;
18 if (max > topage) max = topage;
19 var products = grabProduct(astore, node, min, max, env);
20 if (products.size() == 0) break;
21 for (var i = 0; i < products.size(); i++) {
22 var pro = products.get(i);
23 saveProduct(pro, env);
24 }
25 env.info('Saved: ' + products.size());
26 }
27 env.info('Saved all from category: ' + node);
28 markCategory(node, env);
29 }
30 } else {
31 for (var no = frompage; no <= topage; no += batch) {
32 var min = no;
33 var max = no + batch - 1;
34 if (max > topage) max = topage;
35 var products = grabProduct(astore, node, min, max, env);
36 if (products.size() == 0) break;
37 for (var i = 0; i < products.size(); i++) {
38 var pro = products.get(i);
39 saveProduct(pro, env);
40 }
41 env.info('Saved: ' + products.size());
42 }
43 }
44}
45
46function clearCategoryMarks(env) {
47 var entity = env.newEntity();
48 var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
49 for (var i = 0; i < results.size(); i++) {
50 results.get(i).setMark('');
51 results.get(i).save();
52 }
53}
54
55function markCategory(node, env) {
56 var cat = env.newEntity();
57 var results = cat.search('Category_Amazon', cat.newTermQuery(cat.newTerm('node', node)), 1);
58 if (results.size() == 0) return;
59 cat = results.get(0);
60 cat.setMark('crawled');
61 cat.save();
62}
63
64function loadCategories(env) {
65 var tag = env.newArrayList();
66 var entity = env.newEntity();
67 var results = entity.search('Category_Amazon', entity.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
68 for (var i = 0; i < results.size(); i++) {
69 if (results.get(i).getMark() == 'crawled') continue;
70 tag.add(results.get(i).getString('node'));
71 }
72 return tag;
73}
74
75function saveLink(title, url, desc, env) {
76 if (findLinkByUrl(url, env)) return;
77 var schema = 's|url|a|title|a|desc';
78 var entity = env.newEntity();
79 entity.setSchema(schema);
80 entity.setKind('Link');
81 entity.setId(env.uniqid());
82 entity.setString('url', url);
83 entity.setString('title', title);
84 entity.setString('desc', desc);
85 entity.save();
86}
87
88function findLinkByUrl(url, env) {
89 var entity = env.newEntity();
90 var query = entity.newTermQuery(entity.newTerm('url', url));
91 var size = entity.count('Link', query, 1);
92 return (size > 0);
93}
94
95function saveProduct(pro, env) {
96 var title = pro.get('title');
97 var url = pro.get('url');
98 if (title == null || title.length == 0 || url == null || url.length == 0) return;
99 var desc = pro.get('description') + '';
100 if (desc == null) desc = '';
101 if (desc.length > 0) {
102 var doc = env.newJsoup().parse(desc);
103 desc = doc.select('body').first().text();
104 }
105 saveLink(title, url, desc, env);
106}
107
108function grabProduct(astore, node, frompage, topage, env) {
109 var tag = env.newArrayList();
110 for (var no = frompage; no <= topage; no++) {
111 try {
112 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
113 var doc = env.newJsoup().parse(alink, 60000);
114 var elements = doc.select('#featuredProducts .textrow a');
115 var map = env.newHashMap();
116 for (var i = 0; i < elements.size(); i++) {
117 var element = elements.get(i);
118 var title = element.text();
119 var url = element.attr('href');
120 var pos = url.lastIndexOf('/detail/');
121 if (pos < 0) continue;
122 var code = url.substring(pos + 8);
123 var url = env.newURL(alink, url) + '';
124 var item = env.newHashMap();
125 item.put('code', code);
126 item.put('title', title);
127 item.put('url', url);
128 map.put(code, item);
129 }
130 elements = doc.select('#featuredProducts .imagerow a');
131 for (var i = 0; i < elements.size(); i++) {
132 var element = elements.get(i);
133 var url = element.attr('href');
134 var pos = url.lastIndexOf('/detail/');
135 if (pos < 0) continue;
136 var code = url.substring(pos + 8);
137 var item = map.get(code);
138 if (item == null) continue;
139 var child = element.select('img').first();
140 if (child == null) continue;
141 var title = child.attr('alt');
142 var smimg = child.attr('src');
143 if (title.length() > 0) {
144 item.put('title', title);
145 }
146 item.put('small-image', smimg);
147 }
148
149 var keys = env.getKeys(map);
150 for (var i = 0; i < keys.size(); i++) {
151 try {
152 var item = map.get(keys.get(i));
153 alink = env.newURL(item.get('url'));
154 doc = env.newJsoup().parse(alink, 60000);
155 var element = doc.select('#detailImage img').first();
156 if (element != null) {
157 item.put('large-image', element.attr('src'));
158 }
159 element = doc.select('#productDescription').first();
160 if (element != null) {
161 var desc = element.html();
162 var pattern = '<h2>Product Description</h2>';
163 var pos = desc.indexOf(pattern);
164 if (pos >= 0) {
165 desc = desc.substring(pos + pattern.length);
166 }
167 var bdoc = env.newJsoup().parse(desc, item.get('url'));
168 buildURL(bdoc, item.get('url'), env);
169 desc = bdoc.select('body').first().html();
170 if (desc.indexOf('<html') < 0) {
171 item.put('description', desc);
172 }
173 }
174 element = doc.select('#productDetails').first();
175 if (element != null) {
176 var desc = element.html();
177 var pattern = '<h2>Product Details</h2>';
178 var pos = desc.indexOf(pattern);
179 if (pos >= 0) {
180 desc = desc.substring(pos + pattern.length);
181 }
182 var bdoc = env.newJsoup().parse(desc, item.get('url'));
183 buildURL(bdoc, item.get('url'), env);
184 desc = bdoc.select('body').first().html();
185 if (desc.indexOf('<html') < 0) {
186 item.put('details', desc);
187 }
188 }
189 element = doc.select('#editorialReviews').first();
190 if (element != null) {
191 var desc = element.html();
192 var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
193 buildURL(bdoc, item.get('url'), env);
194 desc = bdoc.select('body').first().html();
195 if (desc.indexOf('<html') < 0) {
196 item.put('editorial-reviews', desc);
197 }
198 }
199 element = doc.select('#detailListPrice').first();
200 if (element != null) {
201 item.put('list-price', element.text());
202 }
203 element = doc.select('#detailOfferPrice').first();
204 if (element != null) {
205 item.put('offer-price', element.text());
206 }
207 element = doc.select('#addToCartForm a').first();
208 if (element != null) {
209 item.put('buy-url', element.attr('href'));
210 }
211 env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
212 } catch (e) {
213 env.error(e);
214 }
215 }
216
217 for (var i = 0; i < keys.size(); i++) {
218 tag.add(map.get(keys.get(i)));
219 }
220 } catch (e) {
221 env.error(e);
222 }
223 }
224 return tag;
225}
226
227function buildURL(doc, baseUrl, env) {
228 baseUrl = env.newURL(baseUrl);
229 var elements = doc.select('a');
230 for (var i = 0; i < elements.size(); i++) {
231 var element = elements.get(i);
232 var url = env.newURL(baseUrl, element.attr('href'));
233 element.attr('href', url + '');
234 }
235 elements = doc.select('img');
236 for (var i = 0; i < elements.size(); i++) {
237 var element = elements.get(i);
238 var url = env.newURL(baseUrl, element.attr('src'));
239 element.attr('src', url + '');
240 }
241}
// Entry point of the product spider.
//
// When 'node' is empty, every category returned by loadCategories() is
// crawled and then marked via markCategory(); otherwise only the single
// configured node is crawled. Pages are fetched in batches of 'batch'
// pages through grabProduct(), and each returned product is handed to
// saveProduct().
//   env  - host object; used here for info() and passed on to the helpers
//   args - spider arguments (not used here)
function main(env, args) {
  var astore = 'paesia';
  var node = '';
  var frompage = 1;
  var topage = 10000;
  var batch = 5;
  var cache = true;

  // Crawls pages frompage..topage of one category in batches and stops at
  // the first batch that yields no products.
  function crawlNode(target) {
    for (var no = frompage; no <= topage; no += batch) {
      var min = no;
      var max = no + batch - 1;
      if (max > topage) max = topage;
      var products = grabProduct(astore, target, min, max, env);
      if (products.size() == 0) break;
      // 'p' is deliberately distinct from the category index below: 'var'
      // is function-scoped, so sharing one counter would corrupt the
      // category loop.
      for (var p = 0; p < products.size(); p++) {
        saveProduct(products.get(p), env);
      }
      env.info('Saved: ' + products.size());
    }
  }

  if (node.length == 0) {
    if (!cache) {
      // Forget earlier progress so every category is crawled again.
      clearCategoryMarks(env);
    }
    var nodelist = loadCategories(env);
    for (var i = 0; i < nodelist.size(); i++) {
      var current = nodelist.get(i);
      crawlNode(current);
      env.info('Saved all from category: ' + current);
      markCategory(current, env);
    }
  } else {
    crawlNode(node);
  }
}

// Sets the mark of every 'Category_Amazon' entity to the empty string and
// saves it.
//   env - host object supplying newEntity()
function clearCategoryMarks(env) {
  var finder = env.newEntity();
  var everything = finder.newMatchAllDocsQuery();
  var hits = finder.search('Category_Amazon', everything, java.lang.Integer.MAX_VALUE);
  var idx = 0;
  while (idx < hits.size()) {
    hits.get(idx).setMark('');
    hits.get(idx).save();
    idx += 1;
  }
}

// Marks the first 'Category_Amazon' entity whose 'node' term equals the
// given node as 'crawled' and saves it. Does nothing when no entity matches.
//   node - category node id to look up
//   env  - host object supplying newEntity()
function markCategory(node, env) {
  var finder = env.newEntity();
  var byNode = finder.newTermQuery(finder.newTerm('node', node));
  var hits = finder.search('Category_Amazon', byNode, 1);
  if (hits.size() != 0) {
    var match = hits.get(0);
    match.setMark('crawled');
    match.save();
  }
}

// Collects the 'node' value of every 'Category_Amazon' entity whose mark is
// not 'crawled'.
//   env - host object supplying newArrayList() and newEntity()
// Returns the list created by env.newArrayList(), filled with node values.
function loadCategories(env) {
  var pending = env.newArrayList();
  var finder = env.newEntity();
  var hits = finder.search('Category_Amazon', finder.newMatchAllDocsQuery(), java.lang.Integer.MAX_VALUE);
  for (var idx = 0; idx < hits.size(); idx += 1) {
    var alreadyCrawled = hits.get(idx).getMark() == 'crawled';
    if (!alreadyCrawled) {
      pending.add(hits.get(idx).getString('node'));
    }
  }
  return pending;
}

// Stores a new 'Link' entity with the given url, title and description,
// unless findLinkByUrl() reports that the url is already stored.
//   title, url, desc - field values written to the entity
//   env              - host object supplying newEntity() and uniqid()
function saveLink(title, url, desc, env) {
  var alreadyStored = findLinkByUrl(url, env);
  if (!alreadyStored) {
    var link = env.newEntity();
    // Schema string is passed through as-is; its format is defined by the host.
    link.setSchema('s|url|a|title|a|desc');
    link.setKind('Link');
    link.setId(env.uniqid());
    var fields = [['url', url], ['title', title], ['desc', desc]];
    for (var f = 0; f < fields.length; f++) {
      link.setString(fields[f][0], fields[f][1]);
    }
    link.save();
  }
}

// Reports whether at least one 'Link' entity matches a term query on 'url'.
//   url - value to look up
//   env - host object supplying newEntity()
// Returns true when the count is greater than zero.
function findLinkByUrl(url, env) {
  var finder = env.newEntity();
  var term = finder.newTerm('url', url);
  var matches = finder.count('Link', finder.newTermQuery(term), 1);
  return matches > 0;
}

// Saves one grabbed product as a link via saveLink().
//
// Products without a title or url are skipped. The HTML stored under
// 'description' is reduced to its plain text before saving; a product with
// no description is saved with an empty one.
//   pro - map-like product record read through get('title'), get('url')
//         and get('description')
//   env - host object supplying newJsoup(); also passed on to saveLink()
function saveProduct(pro, env) {
  var title = pro.get('title');
  var url = pro.get('url');
  if (title == null || title.length == 0 || url == null || url.length == 0) return;
  // Test for a missing description BEFORE converting to a string:
  // concatenating first would turn a missing value into the text
  // "null"/"undefined", which would then be parsed and stored.
  var rawDesc = pro.get('description');
  var desc = (rawDesc == null) ? '' : rawDesc + '';
  if (desc.length > 0) {
    // Strip the markup and keep only the text of the parsed body.
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }
  saveLink(title, url, desc, env);
}

// Grabs the products listed on pages frompage..topage (inclusive) of one
// aStore category and returns them as a list of item maps.
//
//   astore   - store name inserted into the listing URL
//   node     - category node id inserted into the listing URL
//   frompage - first listing page to fetch
//   topage   - last listing page to fetch
//   env      - host object supplying newArrayList(), newHashMap(), newURL(),
//              newJsoup(), getKeys(), info() and error()
//
// Each item always carries 'code', 'title' and 'url'. Depending on what the
// pages contain it may also carry 'small-image', 'large-image',
// 'description', 'details', 'editorial-reviews', 'list-price',
// 'offer-price' and 'buy-url'.
//
// Errors are logged through env.error() and never rethrown: a failure while
// reading one product's detail page skips the rest of that product's fields,
// and a failure on a listing page skips that page.
function grabProduct(astore, node, frompage, topage, env) {
  var tag = env.newArrayList();
  for (var no = frompage; no <= topage; no++) {
    try {
      var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var doc = env.newJsoup().parse(alink, 60000);
      // Pass 1: text links. One item per product, keyed by product code.
      var elements = doc.select('#featuredProducts .textrow a');
      var map = env.newHashMap();
      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var title = element.text();
        var url = element.attr('href');
        var pos = url.lastIndexOf('/detail/');
        if (pos < 0) continue;
        // The code is everything after the last '/detail/' (8 characters long).
        var code = url.substring(pos + 8);
        // Resolve the href against the listing page URL and keep it as a string.
        var url = env.newURL(alink, url) + '';
        var item = env.newHashMap();
        item.put('code', code);
        item.put('title', title);
        item.put('url', url);
        map.put(code, item);
      }
      // Pass 2: image links. Only enriches items already found in pass 1.
      elements = doc.select('#featuredProducts .imagerow a');
      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var url = element.attr('href');
        var pos = url.lastIndexOf('/detail/');
        if (pos < 0) continue;
        var code = url.substring(pos + 8);
        var item = map.get(code);
        if (item == null) continue;
        var child = element.select('img').first();
        if (child == null) continue;
        var title = child.attr('alt');
        var smimg = child.attr('src');
        // NOTE(review): length is called as a method here but read as a
        // property elsewhere in this file - presumably attr() yields a Java
        // string in the host; confirm.
        if (title.length() > 0) {
          // A non-empty image alt text replaces the link text as the title.
          item.put('title', title);
        }
        item.put('small-image', smimg);
      }

      // Pass 3: fetch each product's detail page and collect its fields.
      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          var item = map.get(keys.get(i));
          // alink and doc are reused for the detail page from here on.
          alink = env.newURL(item.get('url'));
          doc = env.newJsoup().parse(alink, 60000);
          var element = doc.select('#detailImage img').first();
          if (element != null) {
            item.put('large-image', element.attr('src'));
          }
          element = doc.select('#productDescription').first();
          if (element != null) {
            var desc = element.html();
            // Drop everything up to and including the section heading, if present.
            var pattern = '<h2>Product Description</h2>';
            var pos = desc.indexOf(pattern);
            if (pos >= 0) {
              desc = desc.substring(pos + pattern.length);
            }
            // Re-parse the fragment with the product URL as base, run
            // buildURL() over its links and images, then take the body HTML.
            var bdoc = env.newJsoup().parse(desc, item.get('url'));
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            // Stored only when the result contains no '<html' tag.
            if (desc.indexOf('<html') < 0) {
              item.put('description', desc);
            }
          }
          element = doc.select('#productDetails').first();
          if (element != null) {
            // Same treatment as the description section above.
            var desc = element.html();
            var pattern = '<h2>Product Details</h2>';
            var pos = desc.indexOf(pattern);
            if (pos >= 0) {
              desc = desc.substring(pos + pattern.length);
            }
            var bdoc = env.newJsoup().parse(desc, item.get('url'));
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            if (desc.indexOf('<html') < 0) {
              item.put('details', desc);
            }
          }
          element = doc.select('#editorialReviews').first();
          if (element != null) {
            // Same treatment again, but no heading is stripped here.
            var desc = element.html();
            var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
            buildURL(bdoc, item.get('url'), env);
            desc = bdoc.select('body').first().html();
            if (desc.indexOf('<html') < 0) {
              item.put('editorial-reviews', desc);
            }
          }
          element = doc.select('#detailListPrice').first();
          if (element != null) {
            item.put('list-price', element.text());
          }
          element = doc.select('#detailOfferPrice').first();
          if (element != null) {
            item.put('offer-price', element.text());
          }
          element = doc.select('#addToCartForm a').first();
          if (element != null) {
            // The href is stored exactly as attr() returns it (not resolved).
            item.put('buy-url', element.attr('href'));
          }
          // Progress line: node, listing page, 1-based item number, product URL.
          env.info(node + ' : ' + no + ' : ' + (i + 1) + ' : ' + item.get('url'));
        } catch (e) {
          // Log and move on to the next product; the item stays in the map.
          env.error(e);
        }
      }

      // Every item of this page goes into the result, including items whose
      // detail page failed above.
      for (var i = 0; i < keys.size(); i++) {
        tag.add(map.get(keys.get(i)));
      }
    } catch (e) {
      // A failure before this point adds nothing from this page to the result.
      env.error(e);
    }
  }
  return tag;
}

// Rewrites every link and image in 'doc' to the URL obtained by resolving
// its current attribute value against 'baseUrl'.
//   doc     - parsed document; modified in place
//   baseUrl - value passed to env.newURL() to build the base URL
//   env     - host object supplying newURL()
// 'a' elements get their 'href' rewritten, 'img' elements their 'src'.
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);

  // Replaces attribute 'attrName' on every element matched by 'selector'
  // with its value resolved against the base URL.
  function absolutize(selector, attrName) {
    var elements = doc.select(selector);
    for (var i = 0; i < elements.size(); i++) {
      var element = elements.get(i);
      var url = env.newURL(base, element.attr(attrName));
      // '' + url turns the URL object into its string form.
      element.attr(attrName, url + '');
    }
  }

  absolutize('a', 'href');
  // The image pass must work on the 'img' selection; a misspelled variable
  // here would make it walk the anchors a second time instead.
  absolutize('img', 'src');
}

Grab categories from Amazon aStores

This spider grabs categories from Amazon aStores and saves them to Lucene indexes.

1function main(env, args) {
2 var astore = 'paesia';
3 try {
4 var categories = grabCategory(astore, env);
5 var map = env.newHashMap();
6 for (var i = 0; i < categories.size(); i++) {
7 var cat = categories.get(i);
8 map.put(cat.get('node'), cat);
9 }
10 for (var i = 0; i < categories.size(); i++) {
11 var cat = categories.get(i);
12 var node = cat.get('node');
13 var title = cat.get('title');
14 var parent = cat.get('parent');
15 saveCategory(title, node, parent, env);
16 }
17 } catch (e) {
18 env.error(e);
19 }
20}
21
22function saveCategory(title, node, parent, env) {
23 if (findCategoryByNode(node, env)) return;
24 var schema = 's|node|s|title|s|parent';
25 var entity = env.newEntity();
26 entity.setSchema(schema);
27 entity.setKind('Category_Amazon');
28 entity.setId(env.uniqid());
29 entity.setString('node', node);
30 entity.setString('title', title);
31 entity.setString('parent', parent);
32 entity.save();
33}
34
35function findCategoryByNode(node, env) {
36 var entity = env.newEntity();
37 var query = entity.newTermQuery(entity.newTerm('node', node));
38 var size = entity.count('Category_Amazon', query, 1);
39 return (size > 0);
40}
41
42function grabCategory(astore, env) {
43 var tag = env.newArrayList();
44 try {
45 var nodelist = env.newArrayList();
46 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20');
47 var doc = env.newJsoup().parse(alink, 60000);
48 var elements = doc.select('#searchbrowse a');
49 for (var i = 0; i < elements.size(); i++) {
50 var element = elements.get(i);
51 var title = element.text();
52 var url = element.attr('href');
53 var pos = url.lastIndexOf('node=');
54 if (pos < 0) continue;
55 var node = url.substring(pos + 5);
56 pos = node.indexOf('&');
57 if (pos >= 0) {
58 node = node.substring(0, pos);
59 }
60 var item = env.newHashMap();
61 item.put('title', title);
62 item.put('node', node);
63 item.put('parent', '');
64 tag.add(item);
65 nodelist.add(node);
66 env.info(node + ' : ' + title);
67 }
68 var no = 0;
69 while (no < nodelist.size()) {
70 alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + nodelist.get(no));
71 doc = env.newJsoup().parse(alink, 60000);
72 elements = doc.select('#searchbrowse .indent a');
73 for (var i = 0; i < elements.size(); i++) {
74 var element = elements.get(i);
75 var title = element.text();
76 var url = element.attr('href');
77 var pos = url.lastIndexOf('node=');
78 if (pos < 0) continue;
79 var node = url.substring(pos + 5);
80 pos = node.indexOf('&');
81 if (pos >= 0) {
82 node = node.substring(0, pos);
83 }
84 if (nodelist.indexOf(node) >= 0) continue;
85 var item = env.newHashMap();
86 item.put('title', title);
87 item.put('node', node);
88 item.put('parent', nodelist.get(no));
89 tag.add(item);
90 nodelist.add(node);
91 env.info(node + ' : ' + title);
92 }
93 no++;
94 }
95 } catch (e) {
96 env.error(e);
97 }
98 return tag;
99}
// Entry point of the category spider: grabs every category of the store
// through grabCategory() and stores each one through saveCategory().
// Any error is logged through env.error() instead of being rethrown.
//   env  - host object; used here for error() and passed on to the helpers
//   args - spider arguments (not used here)
function main(env, args) {
  var astore = 'paesia';
  try {
    var categories = grabCategory(astore, env);
    for (var i = 0; i < categories.size(); i++) {
      var cat = categories.get(i);
      var node = cat.get('node');
      var title = cat.get('title');
      var parent = cat.get('parent');
      saveCategory(title, node, parent, env);
    }
  } catch (e) {
    env.error(e);
  }
}

// Stores a new 'Category_Amazon' entity with the given node, title and
// parent, unless findCategoryByNode() reports the node is already stored.
//   title, node, parent - field values written to the entity
//   env                 - host object supplying newEntity() and uniqid()
function saveCategory(title, node, parent, env) {
  var alreadyStored = findCategoryByNode(node, env);
  if (!alreadyStored) {
    var category = env.newEntity();
    // Schema string is passed through as-is; its format is defined by the host.
    category.setSchema('s|node|s|title|s|parent');
    category.setKind('Category_Amazon');
    category.setId(env.uniqid());
    var fields = [['node', node], ['title', title], ['parent', parent]];
    for (var f = 0; f < fields.length; f++) {
      category.setString(fields[f][0], fields[f][1]);
    }
    category.save();
  }
}

// Reports whether at least one 'Category_Amazon' entity matches a term
// query on 'node'.
//   node - value to look up
//   env  - host object supplying newEntity()
// Returns true when the count is greater than zero.
function findCategoryByNode(node, env) {
  var finder = env.newEntity();
  var term = finder.newTerm('node', node);
  var matches = finder.count('Category_Amazon', finder.newTermQuery(term), 1);
  return matches > 0;
}

// Grabs the categories of one aStore and returns them as a list of maps
// with the keys 'title', 'node' and 'parent'.
//
// Categories linked from the store front page get an empty 'parent'. The
// page of every known node is then visited in turn, and each not-yet-known
// node linked from its '.indent' entries is added with that page's node as
// 'parent' and queued for a visit itself.
//   astore - store name inserted into the URLs
//   env    - host object supplying newArrayList(), newHashMap(), newURL(),
//            newJsoup(), info() and error()
// Any error ends the crawl, is logged through env.error(), and whatever was
// collected so far is returned.
function grabCategory(astore, env) {
  var tag = env.newArrayList();

  // Returns the value following the last 'node=' in href, cut at the next
  // '&', or null when href has no 'node='.
  function nodeOf(href) {
    var start = href.lastIndexOf('node=');
    if (start < 0) return null;
    var value = href.substring(start + 5);
    var amp = value.indexOf('&');
    return amp < 0 ? value : value.substring(0, amp);
  }

  // Adds one category record to the result, remembers its node and logs it.
  function record(known, title, node, parent) {
    var item = env.newHashMap();
    item.put('title', title);
    item.put('node', node);
    item.put('parent', parent);
    tag.add(item);
    known.add(node);
    env.info(node + ' : ' + title);
  }

  try {
    var storeUrl = 'http://astore.amazon.com/' + astore + '-20';
    var nodelist = env.newArrayList();
    var alink = env.newURL(storeUrl);
    var doc = env.newJsoup().parse(alink, 60000);

    // Front page: every link carrying a node id is a top-level category.
    var elements = doc.select('#searchbrowse a');
    var i, element, title, node;
    for (i = 0; i < elements.size(); i++) {
      element = elements.get(i);
      title = element.text();
      node = nodeOf(element.attr('href'));
      if (node === null) continue;
      record(nodelist, title, node, '');
    }

    // nodelist grows while it is being walked, so newly found
    // sub-categories are visited too.
    for (var no = 0; no < nodelist.size(); no++) {
      var parentNode = nodelist.get(no);
      alink = env.newURL(storeUrl + '?node=' + parentNode);
      doc = env.newJsoup().parse(alink, 60000);
      elements = doc.select('#searchbrowse .indent a');
      for (i = 0; i < elements.size(); i++) {
        element = elements.get(i);
        title = element.text();
        node = nodeOf(element.attr('href'));
        if (node === null) continue;
        if (nodelist.indexOf(node) >= 0) continue;
        record(nodelist, title, node, parentNode);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

Screenshots of Bewsia - Micro Search Engine for Desktop