#!/usr/bin/python
# -*- coding: utf-8 -*-
# Simple collection crawler
# 1. Collects Yahoo! Answers; adapt the collect and parsedata functions to crawl other sites.
# 2. Requires sqlite3 or pysqlite.
# 3. Runs on dreamhost.com shared hosting.
# 4. The User-Agent can be changed to impersonate a search-engine spider.
# 5. A pause time can be set to throttle the crawl speed.
# 6. Yahoo blocks crawling IP addresses for several hours, so this crawler is of limited use.
# Author: Lukin <mylukin@gmail.com>
# Date: 2008-09-25
# Modules required by the crawler
Import re, sys, time
Import httplib, OS. path as OSP
From urlparse import urlparse
# The sqite database can only be written in this way to be compatible with the space of dreamhost.com.
Try:
Import sqlite3 as SQLite
Failed t importerror:
From pysqlite2 import dbapi2 as SQLite
# Acquisition speed control, in seconds
Sleep = 0
# Database path
Dbname = './database. db'
# Set the submitted Header
Headers = {"accept": "*/*", "Referer": "http://answers.yahoo.com/", "User-Agent": "Mozilla/5.0 + (compatible; + googlebot/2.1; + http://www.google.com/bot.html )"}
# Connecting to the server
DL = httplib. httpconnection ('answers .yahoo.com ')
# Connecting to a database
Conn = SQLite. Connect (OSP. abspath (dbname ))
# Creating a database
Def createdatabase ():
Global Conn, dbname;
If OSP. isfile (OSP. abspath (dbname): Return
C = conn. cursor ()
# Create a table for storing the URL list
C.exe cute ('''create table if not exists [collect] ([CID] integer primary key, [curl] Text, [State] integer default '0 ', unique ([curl]); ''')
C.exe cute (''' create index if not exists [collect_idx_state] on [collect] ([State]); ''')
# Create a Category Table
C.exe cute (''' create table if not exists [sorts] ([sortid] integer primary key, [sortname] Text, [sortpath] text, [sortfoot] integer default '0', [sortnum] integer default '0', unique ([sortpath]); ''')
C.exe cute (''' create index if not exists [sorts_idx_sortname] on [sorts] ([sortname]); ''')
C.exe cute (''' create index if not exists [sorts_idx_sortfoot] on [sorts] ([sortfoot]); ''')
# Create Article Table
C.exe cute ('''create table if not exists [Article] ([aid] integer primary key, [sortid] integer default '0', [Hits] integer default '0 ', [title] Text, [path] Text, [question] Text, [banswer] Text, [oanswer] Text, unique ([path]); ''')
C.exe cute (''' create index if not exists [article_idx_sortid] on [Article] ([sortid]); ''')
# Transaction submission
Conn. Commit ()
C. Close ()
# Perform collection
Def collect (url = "http://answers.yahoo.com /"):
Global DL, error, headers; r = 0
Print "Get:", URL
URLs = urlparse (URL); Path = URLs [2];
If URLs [4]! = '': Path + = '? '+ URLs [4]
DL. Request (method = "get", url = path, headers = headers); RS = DL. getresponse ()
If Rs. Status = 200:
R = parsedata (Rs. Read (), URL );
Else:
Print "3 seconds, try again..."; time. Sleep (3)
DL. Request (method = "get", url = path, headers = headers); RS = DL. getresponse ()
If Rs. Status = 200:
R = parsedata (Rs. Read (), URL );
Else:
Print "3 seconds, try again..."; time. Sleep (3)
DL. Request (method = "get", url = path, headers = headers); RS = DL. getresponse ()
If Rs. Status = 200:
R = parsedata (Rs. Read (), URL );
Else:
Print "continue to collect ..."
R = 3
# Update records
Updateoneurl (URL, R)
# Returned results
Return R
# Process collected data
Def parsedata (HTML, URL ):
Global DL, Conn; r = 2;
C = conn. cursor ()
# Formatting html Code
Format = formaturl (clearblank (HTML), URL)
# Retrieve all connections
URLs = Re. findall (r''' (<A [^>] *? Href = "([^"] +) "[^>] *?>) | (<A [^>] *? Href = '([^'] +) '[^>] *?>) ''', Format, re. I)
If URLs! = None:
I = 0
# Loop all connections
For regs in URLs:
# Get a single URL
Surl = en2chr (regs [1]. Strip ())
# Determine whether the URL meets the rules. If yes, the database is inserted.
If Re. Search ('HTTP (.*?) /(DIR | question)/index (.*?) ', Surl, re. I )! = None:
If Re. Search ('HTTP (.*?) /DIR/index (.*?) ', Surl, re. I )! = None:
If Surl. Find ('link = list') =-1 and Surl. Find ('link = over') =-1:
Surl + = '& link = over'
Else:
Surl = Surl. Replace ('link = list', 'link = over ')
If Surl [-11:] = 'link = mailto': continue
Try:
C.exe cute ('insert into [collect] ([curl]) values (?); ', (Surl ,))
I = I + 1
Failed t SQLite. integrityerror:
Pass
If I> 0: Print "message: % d get a new URL." % (I ,)
# Intercepting data
If Re. Search ('HTTP (. *)/question/index (. *) ', URL, re. I )! = None:
Sortfoot = 0
# Automatically create a category and category relationship
Guide = sect (format, '<ol id = "Yan-Breadcrumbs">', '</OL>', '(<li> (.*?) Home (.*?) </LI> )')
Aguide = Re. findall ('<A [^>] * href = "[^"] * "[^>] *> (.*?) </A> ', guide, re. I)
If aguide! = None:
Sortname = ""
For sortname in aguide:
Sortname = sortname. Strip ()
Sortpath = en2path (sortname)
# Querying whether a category exists
C.exe cute ('select [sortid], [sortname] from [sorts] Where [sortpath] =? Limit 0, 1; ', (sortpath ,))
Row = C. fetchone ();
# Category does not exist. Add category
If ROW = none:
C.exe cute ('insert into [sorts] ([sortname], [sortpath], [sortfoot]) values (?,?,?); ', (Sortname, sortpath, sortfoot ))
Sortfoot = C. lastrowid
Else:
Sortfoot = row [0]
# Title
Title = sect (format, '<H1 class = "subject">', '# Optimal answer
Bestanswer = sect (format, '(<H2> <span> best answer </span> (.*?) </H2> (.*?) <Div class = "content">) ',' (</div> )')
# If the best answer does not exist, no data is collected.
If bestanswer! = None:
# Article path
Path = en2path (sortname + '-' + title. Strip ())
# Problem
Adddata = sect (format, '<Div class = "Additional-details">', '</div> ')
Content = sect (format, '(<H1 class = "subject"> (.*?) <Div class = "content">) ',' (</div> )')
If adddata! = None: content + = '<br/>' + adddata
# Other answers
Otheranswer =''
For regs In Re. findall ('<Div class = "QA-container"> (. + ?) <Div class = "utils-container"> ', format ):
If regs. Find ('<H2>') =-1 and regs. Find ('</H2>') =-1:
A1 = sect (regs, '<Div class = "content">', '</div> ')
A2 = sect (regs, '<Div class = "Reference">', '</div> ')
Otheranswer + = '<Div class = "oanswer">' + A1
If A2! = None: otheranswer + = '<Div class = "Reference">' + A2 + '</div>'
Otheranswer + = '</div>'
# Collection successful judgment
If title! = None and content! = None:
# Writing data to data
Try:
C.exe cute ('insert into [Article] ([sortid], [title], [path], [question], [banswer], [oanswer]) values (?,?,?,?,?,?); ', (Sortfoot, title, path, content, bestanswer, otheranswer ))
Print "message: Rule s.html" % (path ,)
R = 1
Failed t SQLite. integrityerror:
Pass
# Submit for writing data to the database
Conn. Commit (); C. Close ()
Return R
# Obtain a URL
Def getoneurl ():
Global conn; C = conn. cursor ()
C.exe cute ('select [curl] from [collect] Where [State] in () limit ;')
Row = C. fetchone (); C. Close ()
If ROW = none: Return ""
Return row [0]. encode ('utf-8 ')
# Update the status of a record
Def updateoneurl (URL, State ):
Global conn; C = conn. cursor ()
C.exe cute ('Update [collect] Set [State] =? Where [curl] = ?; ', (State, URL ))
Conn. Commit (); C. Close ()
# Clear unnecessary spaces in HTML code
Def clearblank (HTML ):
If Len (HTML) = 0: Return''
Html = Re. sub ('\ r | \ n | \ t', '', HTML)
While html. Find ("")! =-1 or HTML. Find ('& nbsp ;')! =-1:
Html = html. Replace ('& nbsp;', ''). Replace ('','')
Return html
# format a URL
def formaturl (HTML, URL):
URLs = Re. findall (''' (] *? Href = "([^"] +) "[^>] *?>) | (] *? Href = '([^'] +) '[^>] *?>) ''', HTML, re. i)
If URLs = none: Return HTML
for regs in URLs:
html = html. replace (regs [0], matchurl (regs [0], URL)
return HTML
# Format a single URL
Def matchurl (TAG, URL ):
URLs = Re. findall (''' (. *) (SRC | href) = (. + ?) (|/> |>). * | (. *) URL \ ([^ \)] +) \) ''', Tag, re. I)
If URLs = none:
Return tag
Else:
If URLs [0] [5] = '':
Urlquote = URLs [0] [2]
Else:
Urlquote = URLs [0] [5]
If Len (urlquote)> 0:
Curl = Re. sub (''' ['"] ''','', urlquote)
Else:
Return tag
URLs = urlparse (URL); Scheme = URLs [0];
If scheme! = '': Scheme + = '://'
Host = URLs [1]; host = scheme + host
If Len (host) = 0: Return tag
Path = OSP. dirname (URLs [2]);
If Path = '/': Path = '';
If curl. Find ("#")! =-1: curl = curl [: curl. Find ("#")]
# Judgment type
If Re. Search (''' ^ (HTTP | HTTPS | FTP) :( // | \\\\) ([\ W/\\\+ \-~ '@: %]) + \.) + ([\ W/\. \ = \? \ + \-~ '@':! % #] | (& Amp;) | &) + ''', curl, re. I )! = None:
# Skip the URL type starting with HTTP
Return tag
Elif curl [: 1] = '/':
# Absolute path
Curl = Host + curl
Elif curl [: 3] = '../':
# Relative Path
While curl [: 3] = '../':
Curl = curl [3:]
If Len (PATH)> 0:
Path = OSP. dirname (PATH)
Elif curl [: 2] = './':
Curl = Host + path + curl [1:]
Elif curl. Lower () [: 7] = 'mailto: 'or curl. Lower () [: 11] = 'javascript :':
Return tag
Else:
Curl = Host + path + '/' + curl
R = tag. Replace (urlquote, '"' + curl + '"')
Return R
# HTML code truncation Function
Def sect (HTML, start, end, CLS = ''):
If Len (HTML) = 0: return;
# Regular Expression Truncation
If start [: 1] = CHR (40) and start [-1:] = CHR (41) and end [: 1] = CHR (40) and end [-1:] = CHR (41 ):
Rehtml = Re. Search (start + '(.*?) '+ End, HTML, re. I)
If rehtml = none: Return
Rehtml = rehtml. Group ()
Intstart = Re. Search (START, rehtml, re. I). End ()
Intend = Re. Search (end, rehtml, re. I). Start ()
R = rehtml [intstart: intend]
# String Truncation
Else:
# Obtain the start string position
Intstart = html. Lower (). Find (start. Lower ())
# If the start string cannot be found, null is directly returned.
If intstart =-1: Return
# Obtain the position of the end string
Intend = html [intstart + Len (start):]. Lower (). Find (end. Lower ())
# If the end string cannot be found, the return value is null.
If intend =-1: Return
# Start and end strings are available. You can start intercepting them.
R = html [intstart + Len (start): intstart + intend + Len (start)]
# Clear content
If CLS! = '':
R = clear (R, CLS)
# Return truncated characters
Return R
# Regular Expression clearing
Def clear (HTML, regexs ):
If regexs = '': Return html
For RegEx in regexs. Split (CHR (10 )):
RegEx = RegEx. Strip ()
If RegEx! = '':
If RegEx [: 1] = CHR (40) and RegEx [-1:] = CHR (41 ):
Html = Re. sub (RegEx, '', HTML, re. I | re. s)
Else:
Html = html. Replace (RegEx ,'')
Return html
# Format as a path
Def en2path (enstr ):
Return re. sub ('[\ W] +', '-', en2chr (enstr), Re. I | re. U). Strip ('-')
# Replace the &amp; HTML entity with a plain ampersand
Def en2chr (enstr ):
Return enstr. Replace ('& amp ;','&')
# ------------------------------------- Start executionProgram-------------------------------------------
# First create a database
Createdatabase ()
# Start collection
Loops = 0
While true:
If loops> 0:
Url = getoneurl ()
If url = "":
Loops = 0
Else:
Loops = collect (URL)
Else:
Loops = collect ()
# Pause
Time. Sleep (sleep)
If loops = 0: Break
# Closing the HTTP Connection
DL. Close ()
# Exit the program
SYS. Exit ()
# This script references http://www.cnblogs.com/kuyuecs/archive/2008/10/15/1311346.html