python爬蟲第一課,製作搜尋引擎

最後更新：2015-05-25 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：搜尋引擎爬蟲 python 資料庫 sqlite

from BeautifulSoup import *
from urlparse import urljoin

# Very common English words that the indexer skips: crawler.addtoindex
# ignores any word found in this set.  The identifier's spelling is kept
# as-is because the crawler class refers to it by this exact name.
ignaorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

我們的搜尋引擎基於關鍵詞, 所以將連詞,冠詞忽略

下面的程式碼是爬蟲, 將網頁的文字資料儲存到我們的 sqlite 資料庫中; 大家看不懂也沒有關係, 知道這些函數是做什麼的就行了

import re
from sqlite3 import dbapi2 as sqlite

try:
    import urllib2  # Python 2
except ImportError:  # Python 3 keeps urlopen in urllib.request
    import urllib.request as urllib2


class crawler:
    """Crawl web pages and record the words they contain in a SQLite index.

    The tables written here (``urllist``, ``wordlist``, ``wordlocation``)
    are created by :meth:`createindextables`.
    """

    # One or more non-word characters.  The pattern must not be able to
    # match the empty string: on Python 3.7+ ``split`` also splits on empty
    # matches, which would cut the text into single characters.
    _splitter = re.compile(r'\W+')

    def __init__(self, dbname):
        """Open (creating it if needed) the SQLite database ``dbname``.

        Any file name works, e.g. ``'xxx.db'``.
        """
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # ``con`` is missing when connect() raised inside __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in ``table`` whose ``field`` equals ``value``.

        A row is inserted when none exists, and the new rowid is returned.
        ``table`` and ``field`` are interpolated into the SQL text and must
        be trusted identifiers; ``value`` is passed as a bound parameter.
        ``createnew`` is accepted for interface compatibility and is
        currently ignored (a missing row is always inserted).
        """
        found = self.con.execute(
            "select rowid from %s where %s=?" % (table, field),
            (value,)).fetchone()
        if found is not None:
            return found[0]
        cur = self.con.execute(
            "insert into %s (%s) values (?)" % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Index every word of the parsed page ``soup`` under ``url``.

        Does nothing when the URL is already indexed.  Words listed in the
        module-level ``ignaorewords`` set are skipped; the stored location
        is the word's position in the page's word list.
        """
        if self.isindexed(url):
            return
        print('Indexing %s' % url)

        words = self.separatewords(self.gettextonly(soup))
        urlid = self.getentryid('urllist', 'url', url)

        for location, word in enumerate(words):
            if word in ignaorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) "
                "values (?,?,?)", (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of a parsed node, without markup.

        A node whose ``string`` is ``None`` is expanded recursively through
        its ``contents``, with a newline appended after each child's text;
        otherwise the stripped string is returned.
        """
        text = soup.string
        if text is None:
            return ''.join(
                self.gettextonly(child) + '\n' for child in soup.contents)
        return text.strip()

    def separatewords(self, text):
        """Split ``text`` on runs of non-word characters; return lowercase words."""
        return [s.lower() for s in self._splitter.split(text) if s != '']

    def isindexed(self, url):
        """Return True when ``url`` is in ``urllist`` and has indexed words."""
        known = self.con.execute(
            "select rowid from urllist where url=?", (url,)).fetchone()
        if known is None:
            return False
        # A URL may be listed without having been crawled yet; it only counts
        # as indexed once at least one word location is stored for it.
        hit = self.con.execute(
            "select * from wordlocation where urlid=?",
            (known[0],)).fetchone()
        return hit is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Placeholder for recording a link between two pages; does nothing."""
        pass

    def crawl(self, pages, depth=2):
        """Index ``pages`` and follow their links, ``depth`` rounds in total.

        Each round indexes the current pages and collects the not yet
        indexed http(s) links found on them as the next round's pages.
        Pages that cannot be opened are reported and skipped.  The database
        is committed after every page.
        """
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    response = urllib2.urlopen(page)
                except Exception:
                    print('Could not open %s' % page)
                    continue
                try:
                    soup = BeautifulSoup(response.read())
                finally:
                    response.close()
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # URLs containing a quote no longer need to be skipped:
                    # every value reaches SQLite as a bound parameter.
                    url = url.split('#')[0]  # drop the fragment part
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes if they do not exist yet."""
        # NOTE(review): 'linid' in linkwords looks like a typo for 'linkid';
        # left unchanged so databases created earlier keep the same schema.
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer,toid integer)',
            'create table if not exists linkwords(wordid,linid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists wordurlidx on wordlocation(wordid)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()

好了, 有了爬蟲, 我們再將需要爬取的頁面寫出來

# Candidate start pages; each entry is itself a list of URLs so that it can
# be handed directly to crawler.crawl().
pagelist = [
    ['http://en.xjtu.edu.cn/'],
    ['http://www.lib.xjtu.edu.cn/'],
    ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University'],
]

建立資料庫

# Open the index database 'searchindex.db' and create its tables and indexes.
mycrawler = crawler('searchindex.db')
mycrawler.createindextables()

爬取

# Crawl starting from the first entry of pagelist (default depth).
mycrawler.crawl(pages=pagelist[0])

搜尋引擎

class searcher:
    """Query the word index stored in a SQLite database and rank the results."""

    def __init__(self, dbname):
        """Open the SQLite database ``dbname`` that holds the index."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # ``con`` is missing when connect() raised inside __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def getmatchrows(self, q):
        """Find the pages that contain every indexed word of the query ``q``.

        ``q`` is split on single spaces; words missing from ``wordlist`` are
        ignored.  Returns ``(rows, wordids)``: each row is
        ``(urlid, location_of_word0, location_of_word1, ...)`` and
        ``wordids`` lists the rowids of the words that were found.  When no
        word is found, ``([], [])`` is returned without querying further.
        """
        fields = ['w0.urlid']
        tables = []
        clauses = []
        wordids = []

        for word in q.split(' '):
            # The word comes straight from the user: bind it as a parameter.
            wordrow = self.con.execute(
                "select rowid from wordlist where word=?", (word,)).fetchone()
            if wordrow is None:
                continue
            wordid = wordrow[0]
            tablenumber = len(wordids)
            wordids.append(wordid)
            if tablenumber > 0:
                # Join each additional word on the same page as the previous one.
                clauses.append(
                    'w%d.urlid=w%d.urlid' % (tablenumber - 1, tablenumber))
            fields.append('w%d.location' % tablenumber)
            tables.append('wordlocation w%d' % tablenumber)
            clauses.append('w%d.wordid=%d' % (tablenumber, wordid))

        if not wordids:
            # Without any table or clause the statement would be invalid SQL.
            return [], wordids

        fullquery = 'select %s from %s where %s' % (
            ','.join(fields), ','.join(tables), ' and '.join(clauses))
        print(fullquery)
        rows = self.con.execute(fullquery).fetchall()
        return rows, wordids

    def geturlname(self, id):
        """Return the URL stored in ``urllist`` under rowid ``id``."""
        return self.con.execute(
            "select url from urllist where rowid=?", (id,)).fetchone()[0]

    def normaliszescores(self, scores, smallIsBetter=0):
        """Scale the values of ``scores`` so that the best one becomes 1.0.

        With ``smallIsBetter`` the smallest value is the best and every score
        becomes ``min / max(value, 0.00001)``; otherwise every score is
        divided by the largest value (0.00001 when that is 0).  An empty
        mapping yields an empty dict.
        """
        vsmall = 0.00001  # avoids division by zero
        if not scores:
            return {}
        if smallIsBetter:
            minscore = min(scores.values())
            return dict((u, float(minscore) / max(vsmall, l))
                        for (u, l) in scores.items())
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return dict((u, float(c) / maxscore) for (u, c) in scores.items())

    # score methods
    def frequencyscore(self, rows):
        """Score each page by its number of matching rows (more is better)."""
        counts = dict((row[0], 0) for row in rows)
        for row in rows:
            counts[row[0]] += 1
        return self.normaliszescores(counts)

    def locationscore(self, rows):
        """Score each page by its smallest sum of word locations (smaller is better)."""
        locations = dict((row[0], 1000000) for row in rows)
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normaliszescores(locations, smallIsBetter=1)

    def distancescore(self, rows):
        """Score each page by how close together the query words appear.

        With a single query word (rows of at most two columns) every page
        scores 1.0.  Otherwise the score is based on the smallest sum of the
        distances between consecutive word locations (smaller is better).
        """
        if not rows:
            return {}
        if len(rows[0]) <= 2:
            return dict((row[0], 1.0) for row in rows)
        mindistance = dict((row[0], 1000000) for row in rows)
        for row in rows:
            dist = sum(abs(row[i] - row[i - 1]) for i in range(2, len(row)))
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normaliszescores(mindistance, smallIsBetter=1)

    # ------------------------------------------------------------------
    def getscoredlist(self, rows, wordids):
        """Combine the weighted scoring functions into one score per urlid.

        Only ``frequencyscore`` (weight 1.0) is used at the moment.
        """
        totalscores = dict((row[0], 0) for row in rows)

        weights = [(1.0, self.frequencyscore(rows))]

        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def query(self, q):
        """Print the ten best matching pages for ``q`` as ``score<TAB>url`` lines."""
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[:10]:
            print('%f\t%s' % (score, self.geturlname(urlid)))

建立搜尋引擎與資料庫的關聯

# Attach the search engine to the index database.
e = searcher(dbname='searchindex.db')

搜尋

# Run a two-word query; the ranked results are printed.
e.query(q='xjtu college')

這樣你的第一個搜尋引擎就搭建完畢啦:

1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm

python爬蟲第一課,製作搜尋引擎

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More