Python crawler, lesson one: building a search engine


from BeautifulSoup import *
from urlparse import urljoin

# Words too common to be useful as search keys
ignorewords=set(['the','of','to','and','a','in','is','it'])

Our search engine is keyword-based, so conjunctions and articles like these are ignored.
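For example, here is what that filter does to a made-up token list (a minimal sketch; the sample tokens are our own):

tokens=['the','crawler','indexes','a','page','in','python']
print [t for t in tokens if t not in ignorewords]
# -> ['crawler', 'indexes', 'page', 'python']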


The code below is the crawler; it stores each page's text in our SQLite database. Don't worry if you can't follow every line; just know what each function does.

from sqlite3 import dbapi2 as sqlite
import urllib2
import re

class crawler:
    def __init__(self,dbname):
        # Connect to (and create, if needed) the database; any name such as 'xxx.db' will do
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def getentryid(self,table,field,value,createnew=True):
        # Return the rowid for value, inserting a new row if it isn't there yet
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing',url

        # Get words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url, recording its position in the page
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

    def gettextonly(self,soup):
        # Recursively pull the bare text out of the parse tree
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    def separatewords(self,text):
        # Split on anything that isn't a letter or digit
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    def isindexed(self,url):
        u=self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Already crawled; make sure some words were actually stored
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    def addlinkref(self,urlFrom,urlTo,linkText):
        pass

    def crawl(self,pages,depth=2):
        # Breadth-first crawl from the seed pages, down to the given link depth
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open",page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)

                links=soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0] # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages=newpages

    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
Good. Now that we have a crawler, let's list the pages we want to crawl:

pagelist=[['http://en.xjtu.edu.cn/'],
          ['http://www.lib.xjtu.edu.cn/'],
          ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]
Create the database:

mycrawler=crawler('searchindex.db')
mycrawler.createindextables()
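One caveat: createindextables issues plain create table statements, so running it a second time against an existing searchindex.db raises sqlite3.OperationalError. A minimal sketch that only builds the tables on the first run (the os.path.exists check is our own addition):

import os

firstrun=not os.path.exists('searchindex.db')
mycrawler=crawler('searchindex.db') # connecting creates the file if needed
if firstrun:
    mycrawler.createindextables()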
Crawl (we pass pagelist[0], so only the first seed list, the XJTU homepage, is crawled):

mycrawler.crawl(pagelist[0])
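Crawling can take a while. Once it finishes, you can sanity-check what was stored by counting rows in the tables that createindextables defined (a minimal sketch):

from sqlite3 import dbapi2 as sqlite

con=sqlite.connect('searchindex.db')
print con.execute('select count(*) from urllist').fetchone()[0],'pages indexed'
print con.execute('select count(*) from wordlist').fetchone()[0],'distinct words'
print con.execute('select count(*) from wordlocation').fetchone()[0],'word positions'
con.close()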
The search engine:

class searcher:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def getmatchrows(self,q):
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''
        clauselist=''
        wordids=[]

        # Split the words by spaces
        words=q.split(' ')
        tablenumber=0

        for word in words:
            # Get the word ID
            wordrow=self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1

        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        print fullquery
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]
        return rows,wordids

    def geturlname(self,id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]

    def normalizescores(self,scores,smallIsBetter=0):
        # Rescale every metric onto 0..1 so different metrics can be combined
        vsmall=0.00001
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l)
                         in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0: maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

    # Scoring methods
    def frequencyscore(self,rows):
        # More occurrences of the query words is better
        counts=dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]]+=1
        return self.normalizescores(counts)

    def locationscore(self,rows):
        # Query words appearing earlier in the page is better
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            if loc<locations[row[0]]:
                locations[row[0]]=loc
        return self.normalizescores(locations,smallIsBetter=1)

    def distancescore(self,rows):
        # Query words close together in the page is better
        # With only one word, every match gets the same score
        if len(rows[0])<=2:
            return dict([(row[0],1.0) for row in rows])
        mindistance=dict([(row[0],1000000) for row in rows])
        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist<mindistance[row[0]]:
                mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallIsBetter=1)

    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])

        # Each entry pairs a weight with a dict of per-url scores
        weights=[(1.0,self.frequencyscore(rows))]

        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]
        return totalscores

    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=sorted([(score,url) for (url,score) in scores.items()],reverse=1)
        for (score,urlid) in rankedscores[:10]:
            print '%f\t%s' % (score,self.geturlname(urlid))
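Note that getscoredlist currently weights only frequencyscore. Since normalizescores maps every metric onto 0..1, you can blend in the other two methods just by extending the weights list inside getscoredlist; a sketch (the weights here are arbitrary choices of ours):

weights=[(1.0,self.frequencyscore(rows)),
         (1.5,self.locationscore(rows)),
         (1.0,self.distancescore(rows))]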
Connect the search engine to the database:

e=searcher('searchindex.db')
Search:

e.query('xjtu college')
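Because getmatchrows prints fullquery, the first thing you will see is the self-join it built over wordlocation. For a two-word query it looks roughly like this (the word ids 10 and 17 are hypothetical):

select w0.urlid,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.wordid=10 and w0.urlid=w1.urlid and w1.wordid=17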
And with that, your first search engine is up and running:

1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm








