Crawling pages using PhantomJS
I used PhantomJS in my CORS Planner project to crawl module codes from the NUS website. It is really simple to pick up some code from the examples provided on the PhantomJS site. My code looks like this:
var page = require("webpage").create();

// Forward "console.log()" output from the page context to the
// main Phantom context so crawl results show up on stdout.
page.onConsoleMessage = function (msg) {
  console.log(msg);
};

page.open(encodeURI(url), function (status) {
  if (status === "success") {
    console.log("===> Page Loaded");
    var result = page.evaluate(function () {
      // Execute some DOM inspection within the page context
      // ...
      return result;
    });
  } else {
    console.log("===! Unable to access network\n");
  }
  // One page only: shut PhantomJS down once it has been processed.
  phantom.exit();
});
// Full Source Code: http://bit.ly/12gHAdw
Last month, I added support for NTU modules in CORS Planner. However, NTU modules are distributed across 339+ different webpages, and crawling page after page in sequence is far too slow. A better way is to run several PhantomJS webpage objects together, each completing a part of the crawl. The code: (thread is the number of webpage objects, not a real thread; webpage objects run asynchronously)
// "list" holds every NTU page still to be visited.
var llength = list.length; // total number of pages to crawl
var thread = 10;           // number of concurrent webpage objects
var completed = 0;         // shared counter of pages finished so far
// Maximum number of pages each webpage object is responsible for
// (integer division via |0, plus one so the slices cover the list).
var max = ((llength / thread) | 0) + 1;
var i;

// Spawn one webpage object per "thread" and hand each a contiguous
// slice [max*i, max*(i+1)) of the page list.
for (i = 0; i < thread; i++) {
  var aPage = webpage.create();
  aPage.onConsoleMessage = function (msg) {
    console.log(msg);
  };
  visitPage(aPage, max * i, max * (i + 1));
}
// Recursively crawl pages list[idx] .. list[max-1] with one shared
// webpage object. Each call opens a single page, then schedules the
// next index from inside the open() callback.
//
// page - the PhantomJS webpage object owned by this "thread"
// idx  - index into the global "list" of the page to visit now
// max  - one past the last index this thread is responsible for
function visitPage(page, idx, max) {
  // Every page across all threads is done: shut PhantomJS down.
  if (completed >= llength) {
    page.close();
    phantom.exit();
    return ;
  } else if (idx === max || idx >= llength) {
    // This thread's slice is exhausted; release its webpage object.
    page.close();
    return ;
  }
  page.open(encodeURI(url(list[idx])), function (status) {
    if (status !== "success") {
      console.log("===! Unable to access network\n");
    } else {
      var result = page.evaluate(function() {
        // Execute some DOM inspection within the page context
        // ...
        return result;
      });
    }
    // BUG FIX: count the page and move on even when loading failed.
    // Previously the failure branch neither incremented "completed"
    // nor recursed, so one slow/failed page stalled its thread and
    // phantom.exit() was never reached — the crawl hung forever on a
    // flaky network.
    completed++;
    visitPage(page, idx + 1, max);
  });
}
// Full Source Code: http://bit.ly/12GNeGq
Run $ phantomjs crawl-phantomjs.js -t # in a command window, where # is the number of threads you want.
The performance comparison for different number of pages:
Thread | Pages Crawled | Time Taken | Avg Time/Page |
---|---|---|---|
1 | 30 | 152.92s | 5.09s |
9 | 40 | 52.38s | 1.31s |
19 | 40 | 39.95s | 0.99s |
29 | 40 | 32.66s | 0.82s |
Note: sometimes the crawl stalls if you have a slow network connection.
Extra: some crawling plugins for Node.js: