Tuesday, 15 March 2011

groovy - Crawler4j With Grails App -



groovy - Crawler4j With Grails App -

I am making a crawler application in Groovy on Grails, using crawler4j and following this tutorial.

I created a new Grails project and put the BasicCrawlController.groovy file in the controllers package. I did not create a view, because I expected that on doing run-app the crawled info would appear in the crawlStorageFolder (please correct me if my understanding is flawed).

After I ran the application with run-app, I didn't see the crawling info anywhere.

Am I right in expecting a file to be created at the crawlStorageFolder location I have given (C:/crawl/crawler4jStorage)? Do I need to create a view for this? If I want to invoke the crawler controller from another view on click of the submit button of a form, can I write <g:form name="submitWebsite" url="[controller:'basicCrawlController']">?

I ask because I do not have a method in that controller, so is this the right way to invoke the controller?

my code follows:

//all necessary imports (crawler4j classes and java.util.regex.Pattern) go at the top of the file

/**
 * Standalone entry point that configures and starts a crawler4j crawl.
 * Note: the crawl storage folder holds crawler4j's internal frontier
 * database only — the page contents are just printed to stdout by
 * BasicCrawler.visit(), not written to that folder.
 */
public class BasicCrawlController {

    static main(args) throws Exception {
        // Folder for crawler4j's intermediate crawl data (NOT the page contents).
        String crawlStorageFolder = "c:/crawl/crawler4jStorage"
        int numberOfCrawlers = 1
        //int maxDepthOfCrawling = -1  // default: unlimited depth

        CrawlConfig config = new CrawlConfig()
        config.setCrawlStorageFolder(crawlStorageFolder)
        config.setPolitenessDelay(1000)     // be polite: 1 s between requests
        config.setMaxPagesToFetch(100)
        config.setResumableCrawling(false)

        PageFetcher pageFetcher = new PageFetcher(config)
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        // Blocks until the crawl finishes; use numberOfCrawlers instead of a hardcoded 1.
        controller.start(BasicCrawler.class, numberOfCrawlers)
    }
}

class BasicCrawler extends WebCrawler {

    // Skip URLs that point at binary/static resources.
    final static Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g" +
        "|png|tiff?|mid|mp2|mp3|mp4" +
        "|wav|avi|mov|mpeg|ram|m4v|pdf" +
        "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * Decides whether a given URL should be crawled: only non-binary
     * links on the Web_crawler wiki page. The original trailing slash
     * ("/wiki/web_crawler/") excluded the seed page itself, so nothing
     * was ever visited — dropped here.
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() &&
            href.startsWith("http://en.wikipedia.org/wiki/web_crawler")
    }

    /**
     * Called for each fetched page; prints page metadata, parsed
     * content statistics, and the HTTP response headers.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid} ")
        println("URL: ${url} ")
        println("Domain: '${domain}'")
        println("Sub-domain: ' ${subDomain}'")
        println("Path: '${path}'")
        println("Parent page:${parentUrl} ")
        println("Anchor text: ${anchor} ")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()
            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            // original was missing the 'for' keyword here
            for (Header header : responseHeaders) {
                println("\t ${header.getName()} : ${header.getValue()}")
            }
        }
        println("=============")
    }
}

I'll try to translate your code to the Grails standard.

Put this under grails-app/controllers:

/**
 * Grails controller that kicks off a synchronous crawler4j crawl when
 * its index action is requested (e.g. /basicCrawl/index, or via a
 * <g:form> pointing at controller:'basicCrawl'). The request blocks
 * until the crawl finishes, then renders a completion message.
 */
class BasicCrawlController {

    def index() {
        // Folder for crawler4j's intermediate crawl data (NOT the page contents).
        String crawlStorageFolder = "c:/crawl/crawler4jStorage"
        int numberOfCrawlers = 1
        //int maxDepthOfCrawling = -1  // default: unlimited depth

        CrawlConfig crawlConfig = new CrawlConfig()
        crawlConfig.setCrawlStorageFolder(crawlStorageFolder)
        crawlConfig.setPolitenessDelay(1000)     // be polite: 1 s between requests
        crawlConfig.setMaxPagesToFetch(100)
        crawlConfig.setResumableCrawling(false)

        PageFetcher pageFetcher = new PageFetcher(crawlConfig)
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
        CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer)

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        // Blocks until the crawl finishes; use numberOfCrawlers instead of a hardcoded 1.
        controller.start(BasicCrawler.class, numberOfCrawlers)

        render "done crawling"
    }
}

Put this under src/groovy:

/**
 * crawler4j worker class: filters which URLs get crawled and prints
 * details of every page that is fetched.
 */
class BasicCrawler extends WebCrawler {

    // Skip URLs that point at binary/static resources.
    final static Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g" +
        "|png|tiff?|mid|mp2|mp3|mp4" +
        "|wav|avi|mov|mpeg|ram|m4v|pdf" +
        "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * Decides whether a given URL should be crawled: only non-binary
     * links on the Web_crawler wiki page. The original trailing slash
     * ("/wiki/web_crawler/") excluded the seed page itself, so nothing
     * was ever visited — dropped here.
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() &&
            href.startsWith("http://en.wikipedia.org/wiki/web_crawler")
    }

    /**
     * Called for each fetched page; prints page metadata, parsed
     * content statistics, and the HTTP response headers.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid} ")
        println("URL: ${url} ")
        println("Domain: '${domain}'")
        println("Sub-domain: ' ${subDomain}'")
        println("Path: '${path}'")
        println("Parent page:${parentUrl} ")
        println("Anchor text: ${anchor} ")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()
            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            // original was missing the 'for' keyword here
            for (Header header : responseHeaders) {
                println("\t ${header.getName()} : ${header.getValue()}")
            }
        }
        println("=============")
    }
}

grails groovy crawler4j

No comments:

Post a Comment