Python crawler, lesson one: building a search engine

Last Update:2015-05-25 Source: Internet

Author: User

Tags: create, index, sqlite

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

# BeautifulSoup 3 star-import, kept as in the original so that every name the
# rest of the script picks up from it stays available.
from BeautifulSoup import *

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

# Stop words: tokens that are too common to be useful as search keys.
# Entries must be lower-case with no surrounding whitespace, because they are
# compared against tokens that have been split on non-word characters and
# lower-cased; padded or capitalised entries would never match anything.
# NOTE(review): '-' can never be produced by a split on non-word characters;
# it was presumably a different word (e.g. 'in') before the text was
# machine-translated -- confirm against the original article.
ignaorewords = set(['the', 'of', 'to', 'and', 'a', '-', 'is', 'it'])

Our search engine is keyword-based, so conjunctions, articles and similar filler words are ignored.

The following code is the crawler. It stores the text of each web page in our SQLite database. It does not matter if you do not understand every detail; knowing what each function does is enough.

from sqlite3 import dbapi2 as sqlite
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request offers the same urlopen() entry point.
    import urllib.request as urllib2


class crawler:
    """Crawl web pages and record where each word occurs in a SQLite index.

    The index consists of the tables created by ``createindextables``:
    ``urllist``, ``wordlist``, ``wordlocation``, ``link`` and ``linkwords``.
    """

    # Tokens are maximal runs of word characters. '\W+' is used rather than
    # '\W*' because a pattern that can match the empty string makes
    # re.split() cut between every single character on Python 3.7+.
    _splitter = re.compile(r'\W+')

    def __init__(self, dbname):
        """Connect to the database *dbname*, creating the file if needed.

        Any file name works (e.g. 'xxx.db').
        """
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # __init__ may have failed before self.con was assigned.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in *table* whose *field* equals *value*.

        The row is inserted first when it does not exist yet. *createnew* is
        accepted for interface compatibility and is not consulted.

        *table* and *field* are interpolated into the statement (SQL
        identifiers cannot be bound) and must be trusted names; *value* is
        passed as a bound parameter, so quotes in it are harmless.
        """
        cur = self.con.execute(
            "select rowid from %s where %s=?" % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        cur = self.con.execute(
            "insert into %s (%s) values (?)" % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Index every non-stop-word of the parsed page *soup* under *url*.

        Does nothing when *url* is already indexed.
        """
        if self.isindexed(url):
            return
        print('indexing %s' % url)

        # Get the individual words
        text = self.gettextonly(soup)
        words = self.separatewords(text)

        # Get the URL id
        urlid = self.getentryid('urllist', 'url', url)

        # Link each word to this URL, remembering its position on the page
        for location, word in enumerate(words):
            if word in ignaorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) "
                "values (?,?,?)", (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of the node *soup* with the markup removed.

        A node whose ``string`` is None is expanded recursively through its
        ``contents``; each child's text is followed by a newline.
        """
        v = soup.string
        if v is None:
            return ''.join(self.gettextonly(t) + '\n' for t in soup.contents)
        return v.strip()

    def separatewords(self, text):
        """Split *text* on non-word characters; return lower-cased tokens."""
        return [s.lower() for s in self._splitter.split(text) if s != '']

    def isindexed(self, url):
        """Return True when *url* is known and has at least one indexed word."""
        u = self.con.execute(
            "select rowid from urllist where url=?", (url,)).fetchone()
        if u is None:
            return False
        # The URL may be listed without having been crawled yet.
        v = self.con.execute(
            "select * from wordlocation where urlid=?", (u[0],)).fetchone()
        return v is not None

    def addlinkref(self, urlfrom, urlto, linktext):
        """Placeholder for recording a link between two pages; does nothing."""
        pass

    def crawl(self, pages, depth=2):
        """Index *pages*, then the pages they link to, *depth* levels deep.

        Pages that cannot be opened are reported and skipped. The database is
        committed after each page.
        """
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    # Best effort: one unreachable page must not stop the
                    # crawl. Unlike a bare except, Ctrl-C still propagates.
                    print('Could not open %s' % page)
                    continue
                try:
                    html = c.read()
                finally:
                    c.close()
                soup = BeautifulSoup(html)
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # URLs containing an apostrophe are skipped, as before.
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # remove the location portion
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linktext = self.gettextonly(link)
                    self.addlinkref(page, url, linktext)

                self.dbcommit()
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes, then commit.

        ``if not exists`` makes the call safe to repeat on an existing
        database.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer,toid integer)',
            # NOTE(review): 'linid' is kept exactly as published; it is
            # presumably meant to be 'linkid' -- confirm before relying on it.
            'create table if not exists linkwords(wordid,linid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists wordurlidx on wordlocation(wordid)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()

Now that we have the crawler, let's list the pages we want to crawl.

# Seed page lists for the crawler; each entry is a list of start URLs.
# URL literals must not contain spaces.
pagelist = [
    ['http://en.xjtu.edu.cn/'],
    ['http://www.lib.xjtu.edu.cn/'],
    ['http://en.wikipedia.org/wiki/Xi%27an_jiaotong_university'],
]

Create the database and its index tables

# Open (or create) the index database and set up its tables.
mycrawler = crawler('searchindex.db')
mycrawler.createindextables()

Crawl

# Crawl starting from the first seed list (performs network requests).
mycrawler.crawl(pagelist[0])

Search engine

class searcher:
    """Look up query words in the crawler's SQLite index and rank the URLs."""

    def __init__(self, dbname):
        """Connect to the index database *dbname*."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # __init__ may have failed before self.con was assigned.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def getmatchrows(self, q):
        """Find every URL that contains all indexed words of the query *q*.

        Returns ``(rows, wordids)``. Each row is
        ``(urlid, location_of_word0, location_of_word1, ...)`` and *wordids*
        lists the ids of the query words found in the index. Words that are
        not in the index are skipped; if none is found, ``([], [])`` is
        returned instead of running a malformed query.
        """
        # Pieces from which the self-join over wordlocation is built
        fields = ['w0.urlid']
        tables = []
        clauses = []
        wordids = []

        # Split the words by spaces
        for word in q.split(' '):
            if not word:
                continue
            # The index only stores lower-cased words, so fold the query too.
            wordrow = self.con.execute(
                "select rowid from wordlist where word=?",
                (word.lower(),)).fetchone()
            if wordrow is None:
                continue
            wordid = wordrow[0]
            tablenumber = len(wordids)
            wordids.append(wordid)
            if tablenumber > 0:
                # Every word must occur on the same URL as the previous one.
                clauses.append(
                    'w%d.urlid=w%d.urlid' % (tablenumber - 1, tablenumber))
            fields.append('w%d.location' % tablenumber)
            tables.append('wordlocation w%d' % tablenumber)
            clauses.append('w%d.wordid=%d' % (tablenumber, wordid))

        if not wordids:
            return [], wordids

        # Create the query from the separate parts
        fullquery = 'select %s from %s where %s' % (
            ','.join(fields), ','.join(tables), ' and '.join(clauses))
        print(fullquery)
        rows = list(self.con.execute(fullquery))
        return rows, wordids

    def geturlname(self, id):
        """Return the URL stored in ``urllist`` under rowid *id*."""
        return self.con.execute(
            "select url from urllist where rowid=?", (id,)).fetchone()[0]

    def normaliszescores(self, scores, smallisbetter=0):
        """Scale the values of the dict *scores* so the best one becomes 1.0.

        With *smallisbetter* the smallest value is the best one; otherwise
        the largest is. An empty dict yields an empty dict.
        """
        vsmall = 0.00001  # avoids division by zero
        if not scores:
            return {}
        if smallisbetter:
            minscore = min(scores.values())
            return {u: float(minscore) / max(vsmall, l)
                    for (u, l) in scores.items()}
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return {u: float(c) / maxscore for (u, c) in scores.items()}

    # score methods
    def frequencyscore(self, rows):
        """Score each URL by how many match rows it has."""
        counts = {row[0]: 0 for row in rows}
        for row in rows:
            counts[row[0]] += 1
        return self.normaliszescores(counts)

    def locationscore(self, rows):
        """Score each URL by its smallest sum of word locations."""
        locations = {row[0]: 1000000 for row in rows}
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normaliszescores(locations, smallisbetter=1)

    def distancescore(self, rows):
        """Score each URL by how close together the query words occur."""
        if not rows:
            return {}
        # With a single query word there is no distance: everybody gets 1.0.
        if len(rows[0]) <= 2:
            return {row[0]: 1.0 for row in rows}
        mindistance = {row[0]: 1000000 for row in rows}
        for row in rows:
            dist = sum(abs(row[i] - row[i - 1]) for i in range(2, len(row)))
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normaliszescores(mindistance, smallisbetter=1)

    # ------------------------------------------------------------------
    def getscoredlist(self, rows, wordids):
        """Combine the weighted scoring functions into one score per URL id."""
        totalscores = {row[0]: 0 for row in rows}
        weights = [(1.0, self.frequencyscore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def query(self, q):
        """Print the ten best-scoring URLs for the query *q*."""
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[:10]:
            print('%f\t%s' % (score, self.geturlname(urlid)))

Connect the search engine to the database

# Open the index database for searching.
e = searcher('searchindex.db')

# Query words are given in lower case because the index stores lower-cased words.
e.query('xjtu college')

And with that, your first search engine is built. Sample output (score, then URL):

1.000000	http://en.xjtu.edu.cn/xjtu_introduction/introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/schools_and_colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/education/undergraduate_education.htm
0.382353	http://en.xjtu.edu.cn/xjtu_news/news.htm
0.382353	http://en.xjtu.edu.cn/campus_life/student_bodies.htm
0.294118	http://en.xjtu.edu.cn/xjtu_news/teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm

Python crawler, lesson one: building a search engine

This article is an English version of an article that was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More