標籤:搜尋引擎 爬蟲 python 資料庫 sqlite
from BeautifulSoup import *
from urlparse import urljoin

# Stop words that are skipped while indexing: the search engine is
# keyword-based, so common articles, conjunctions and prepositions are ignored.
# NOTE: the (misspelled) name is referenced by crawler.addtoindex; keep it.
ignaorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
我們的搜尋引擎基於關鍵詞, 所以將連詞,冠詞忽略
下面的代碼是爬蟲, 將網頁的文本資料儲存到我們的sqlite資料庫中, 大家看不懂也沒有關係, 知道這些函數是幹什麼的就行了
from sqlite3 import dbapi2 as sqlite
import re

try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2


class crawler:
    """Fetch web pages and record where each word occurs in a SQLite database.

    The tables used are created by ``createindextables``:
    ``urllist``, ``wordlist``, ``wordlocation``, ``link`` and ``linkwords``.
    """

    # Words are separated by runs of one or more non-word characters.
    # ``\W+`` is used rather than ``\W*`` because a pattern that can match the
    # empty string makes ``re.split`` cut between every character on
    # Python 3.7+.
    _splitter = re.compile(r'\W+')

    def __init__(self, dbname):
        """Connect to the database *dbname*, creating the file if needed.

        Any file name works, e.g. ``'xxx.db'``.
        """
        self.con = sqlite.connect(dbname)

    def __del__(self):
        """Close the database connection when the crawler is discarded."""
        # ``con`` is missing if ``__init__`` failed before assigning it.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in *table* whose *field* equals *value*.

        If no such row exists it is inserted and the new rowid is returned,
        unless *createnew* is false, in which case ``None`` is returned.

        *table* and *field* are interpolated into the SQL text (identifiers
        cannot be bound as parameters), so they must come from trusted code.
        *value* is passed as a bound parameter and may contain any characters.
        """
        cur = self.con.execute(
            'select rowid from %s where %s=?' % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        if not createnew:
            return None
        cur = self.con.execute(
            'insert into %s (%s) values (?)' % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Index every word of the parsed page *soup* under *url*.

        Does nothing if the URL has already been indexed. Words listed in the
        module-level ``ignaorewords`` set are skipped, but they still count
        towards the positions of the words that follow them.
        """
        if self.isindexed(url):
            return
        print('Indexing %s' % url)

        # Extract the individual words of the page.
        words = self.separatewords(self.gettextonly(soup))

        # Get the URL id.
        urlid = self.getentryid('urllist', 'url', url)

        # Link each word to this URL together with its position.
        for location, word in enumerate(words):
            if word in ignaorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                'insert into wordlocation(urlid,wordid,location) '
                'values (?,?,?)', (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of a parsed node, without any markup.

        A node whose ``string`` attribute is set yields that string stripped
        of surrounding whitespace; otherwise the text of each child in
        ``contents`` is collected recursively, each followed by a newline.
        """
        v = soup.string
        if v is not None:
            return v.strip()
        return ''.join(self.gettextonly(t) + '\n' for t in soup.contents)

    def separatewords(self, text):
        """Split *text* on non-word characters and return lower-cased words."""
        return [s.lower() for s in self._splitter.split(text) if s != '']

    def isindexed(self, url):
        """Return True if *url* is in ``urllist`` and has indexed words."""
        u = self.con.execute(
            'select rowid from urllist where url=?', (url,)).fetchone()
        if u is None:
            return False
        # The URL may be listed without ever having been crawled, so also
        # require at least one word location for it.
        v = self.con.execute(
            'select * from wordlocation where urlid=?', (u[0],)).fetchone()
        return v is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Record a link between two pages (not implemented yet)."""
        pass

    def crawl(self, pages, depth=2):
        """Index *pages*, then follow their links, *depth* levels in total.

        Each level indexes the current pages and collects the not yet indexed
        ``http`` links found on them as the pages for the next level. Pages
        that cannot be fetched are reported and skipped. The database is
        committed after every page.
        """
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                    try:
                        html = c.read()
                    finally:
                        c.close()
                except Exception:
                    # Deliberately best-effort: one unreachable or broken page
                    # must not stop the crawl. Unlike a bare ``except`` this
                    # still lets KeyboardInterrupt through.
                    print('Could not open %s' % page)
                    continue
                soup = BeautifulSoup(html)
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # URLs containing a quote no longer need to be skipped:
                    # every value is passed to SQLite as a bound parameter.
                    url = url.split('#')[0]  # remove location portion
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                self.dbcommit()
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes if they do not exist.

        Safe to call on a database that was already initialised.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer,toid integer)',
            # The column was misspelled "linid" before.
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists wordurlidx on wordlocation(wordid)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()
好了, 有了爬蟲, 我們再將需要爬取的頁面寫出來
# Seed pages to crawl. Each entry is itself a one-element list, so a single
# entry can be passed directly as the `pages` argument of crawler.crawl().
pagelist=[['http://en.xjtu.edu.cn/'], ['http://www.lib.xjtu.edu.cn/'], ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]
建立資料庫
# Open (or create) the SQLite index database and build its tables.
mycrawler = crawler('searchindex.db')
mycrawler.createindextables()
爬取
# Crawl starting from the first seed list, using crawl()'s default depth of 2.
mycrawler.crawl(pagelist[0])
搜尋引擎
class searcher:
    """Query the word index stored in a SQLite database and rank the results.

    The database is expected to contain the ``urllist``, ``wordlist`` and
    ``wordlocation`` tables that the queries below read from.
    """

    def __init__(self, dbname):
        """Connect to the index database *dbname*."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        """Close the database connection when the searcher is discarded."""
        # ``con`` is missing if ``__init__`` failed before assigning it.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def getmatchrows(self, q):
        """Find the pages that contain every known word of the query *q*.

        Returns ``(rows, wordids)``. Each row is a tuple
        ``(urlid, location_of_word_0, location_of_word_1, ...)``, one row per
        combination of word positions on a page. ``wordids`` holds the ids of
        the query words that exist in ``wordlist``; unknown words are ignored.
        If none of the words is known, ``([], [])`` is returned.
        """
        # Pieces of the SQL statement, joined together at the end.
        fields = ['w0.urlid']
        tables = []
        clauses = []
        wordids = []

        for word in q.split():
            # Get the word ID; the word itself is bound as a parameter.
            wordrow = self.con.execute(
                'select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow is None:
                continue
            wordid = wordrow[0]
            tablenumber = len(wordids)
            wordids.append(wordid)
            if tablenumber > 0:
                # Every additional word must occur on the same page.
                clauses.append('w%d.urlid=w%d.urlid'
                               % (tablenumber - 1, tablenumber))
            fields.append('w%d.location' % tablenumber)
            tables.append('wordlocation w%d' % tablenumber)
            clauses.append('w%d.wordid=%d' % (tablenumber, wordid))

        if not wordids:
            # Without any known word the statement would have empty FROM and
            # WHERE parts and SQLite would reject it.
            return [], wordids

        # Create the query from the separate parts.
        fullquery = 'select %s from %s where %s' % (
            ','.join(fields), ','.join(tables), ' and '.join(clauses))
        print(fullquery)
        rows = list(self.con.execute(fullquery))
        return rows, wordids

    def geturlname(self, id):
        """Return the URL stored in ``urllist`` under rowid *id*."""
        return self.con.execute(
            'select url from urllist where rowid=?', (id,)).fetchone()[0]

    def normaliszescores(self, scores, smallIsBetter=0):
        """Scale the values of the dict *scores* so that the best one is 1.0.

        With *smallIsBetter* the smallest value is the best and each result is
        ``min / value``; otherwise each result is ``value / max``. A tiny
        constant stands in for zero to avoid division by zero. An empty dict
        yields an empty dict.
        """
        vsmall = 0.00001
        if not scores:
            return {}
        if smallIsBetter:
            minscore = min(scores.values())
            return dict((u, float(minscore) / max(vsmall, l))
                        for (u, l) in scores.items())
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return dict((u, float(c) / maxscore) for (u, c) in scores.items())

    # score methods

    def frequencyscore(self, rows):
        """Score each page by how many match rows it has (more is better)."""
        counts = dict((row[0], 0) for row in rows)
        for row in rows:
            counts[row[0]] += 1
        return self.normaliszescores(counts)

    def locationscore(self, rows):
        """Score each page by its smallest sum of word locations.

        Pages whose query words occur earlier score higher.
        """
        locations = dict((row[0], 1000000) for row in rows)
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normaliszescores(locations, smallIsBetter=1)

    def distancescore(self, rows):
        """Score each page by how close together its query words occur.

        With a single query word there is no distance to measure, so every
        page gets 1.0.
        """
        if not rows:
            return {}
        # A row is (urlid, location, ...); two columns means one query word.
        if len(rows[0]) <= 2:
            return dict((row[0], 1.0) for row in rows)

        mindistance = dict((row[0], 1000000) for row in rows)
        for row in rows:
            dist = sum(abs(row[i] - row[i - 1]) for i in range(2, len(row)))
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normaliszescores(mindistance, smallIsBetter=1)

    # ------------------------------------------------------------------------

    def getscoredlist(self, rows, wordids):
        """Combine the weighted scoring functions into one score per page.

        Only ``frequencyscore`` is used at present, with weight 1.0.
        Returns a dict mapping urlid to total score.
        """
        totalscores = dict((row[0], 0) for row in rows)
        weights = [(1.0, self.frequencyscore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def query(self, q):
        """Run the query *q* and print the ten best pages as ``score<TAB>url``.

        Prints no result lines when nothing matches.
        """
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[:10]:
            print('%f\t%s' % (score, self.geturlname(urlid)))
建立搜尋引擎與資料庫的關聯
# Open a searcher on the same database file that the crawler was given above.
e=searcher('searchindex.db')
搜尋
# Run a two-word query; this prints the generated SQL followed by up to ten
# "score<TAB>url" lines, best score first.
e.query('xjtu college')
這樣你的第一個搜尋引擎就搭建完畢啦:
1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm
python爬蟲第一課,製作搜尋引擎