Groovy - Crawler4j with a Grails App
I am making a crawler application in Groovy on Grails, using crawler4j and following this tutorial.
I created a new Grails project and put the BasicCrawlController.groovy file in the controllers package. I did not create a view, because I expected that on doing run-app the crawled info would appear in the crawlStorageFolder (please correct me if my understanding is flawed). After that I ran the application with run-app, but I didn't see the crawling info anywhere.
Am I supposed to do something like this:
<g:form name="submitWebsite" url="[controller:'BasicCrawlController']">
I ask because I do not have any action method in the controller; is this the right way to invoke it?
My code is as follows:
//all necessary imports

public class BasicCrawlController {

    static main(args) throws Exception {
        String crawlStorageFolder = "C:/crawl/crawler4jStorage"
        int numberOfCrawlers = 1
        //int maxDepthOfCrawling = -1  // default

        CrawlConfig config = new CrawlConfig()
        config.setCrawlStorageFolder(crawlStorageFolder)
        config.setPolitenessDelay(1000)
        config.setMaxPagesToFetch(100)
        config.setResumableCrawling(false)

        PageFetcher pageFetcher = new PageFetcher(config)
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        controller.start(BasicCrawler.class, 1)
    }
}

class BasicCrawler extends WebCrawler {

    final static Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g" +
        "|png|tiff?|mid|mp2|mp3|mp4" +
        "|wav|avi|mov|mpeg|ram|m4v|pdf" +
        "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * You should implement this function to specify whether the given URL
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() &&
            href.startsWith("http://en.wikipedia.org/wiki/web_crawler/")
    }

    /**
     * This function is called when a page is fetched and ready to be
     * processed by your program.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid}")
        println("URL: ${url}")
        println("Domain: '${domain}'")
        println("Sub-domain: '${subDomain}'")
        println("Path: '${path}'")
        println("Parent page: ${parentUrl}")
        println("Anchor text: ${anchor}")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()

            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            for (Header header : responseHeaders) {
                println("\t${header.getName()}: ${header.getValue()}")
            }
        }
        println("=============")
    }
}
I'll try to translate your code to the Grails standard.
Put this under grails-app/controllers:
import edu.uci.ics.crawler4j.crawler.CrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlController
import edu.uci.ics.crawler4j.fetcher.PageFetcher
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

class BasicCrawlController {

    def index() {
        String crawlStorageFolder = "C:/crawl/crawler4jStorage"
        int numberOfCrawlers = 1
        //int maxDepthOfCrawling = -1  // default

        CrawlConfig crawlConfig = new CrawlConfig()
        crawlConfig.setCrawlStorageFolder(crawlStorageFolder)
        crawlConfig.setPolitenessDelay(1000)
        crawlConfig.setMaxPagesToFetch(100)
        crawlConfig.setResumableCrawling(false)

        PageFetcher pageFetcher = new PageFetcher(crawlConfig)
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
        CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer)

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        controller.start(BasicCrawler.class, 1)

        render "done crawling"
    }
}
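With an index action in place you do not strictly need a view or a form to kick off the crawl. Assuming the default URL mapping that a freshly generated Grails project ships with in grails-app/conf/UrlMappings.groovy (sketched below; yourApp is a placeholder for whatever your application is named), requesting /yourApp/basicCrawl or /yourApp/basicCrawl/index in the browser invokes the index action. A g:link or g:form pointing at controller: 'basicCrawl' would work just as well.

// grails-app/conf/UrlMappings.groovy - the default mapping a new project generates
class UrlMappings {

    static mappings = {
        // maps /basicCrawl/index (and every other /controller/action URL)
        // onto the matching controller action
        "/$controller/$action?/$id?" {
            constraints {
                // apply constraints here
            }
        }

        "/"(view: "/index")
        "500"(view: '/error')
    }
}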
Put this under src/groovy:
import java.util.regex.Pattern

import org.apache.http.Header

import edu.uci.ics.crawler4j.crawler.Page
import edu.uci.ics.crawler4j.crawler.WebCrawler
import edu.uci.ics.crawler4j.parser.HtmlParseData
import edu.uci.ics.crawler4j.url.WebURL

class BasicCrawler extends WebCrawler {

    final static Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g" +
        "|png|tiff?|mid|mp2|mp3|mp4" +
        "|wav|avi|mov|mpeg|ram|m4v|pdf" +
        "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * You should implement this function to specify whether the given URL
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() &&
            href.startsWith("http://en.wikipedia.org/wiki/web_crawler/")
    }

    /**
     * This function is called when a page is fetched and ready to be
     * processed by your program.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid}")
        println("URL: ${url}")
        println("Domain: '${domain}'")
        println("Sub-domain: '${subDomain}'")
        println("Path: '${path}'")
        println("Parent page: ${parentUrl}")
        println("Anchor text: ${anchor}")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()

            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            for (Header header : responseHeaders) {
                println("\t${header.getName()}: ${header.getValue()}")
            }
        }
        println("=============")
    }
}
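For either class to compile, the crawler4j jar (and its transitive dependencies such as Apache HttpClient) has to be on the classpath. A minimal sketch, assuming a Grails 2.x BuildConfig.groovy and the crawler4j artifact published on Maven Central under the edu.uci.ics group; the version number is only an example, so check for the release you actually want:

// grails-app/conf/BuildConfig.groovy (fragment)
grails.project.dependency.resolution = {
    repositories {
        mavenCentral()
    }
    dependencies {
        // version is an example; pick the crawler4j release you are targeting
        compile 'edu.uci.ics:crawler4j:3.5'
    }
}

Alternatively, dropping the crawler4j jar and its dependencies into the project's lib directory should also work.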
grails groovy crawler4j