Python simple collector

Source: Internet
Author: User
Tags: return tag

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Simple collection crawler
# 1. Collects Yahoo! Answers; modify the collect and parsedata functions to collect data from any website.
# 2. Requires sqlite3 or pysqlite.
# 3. Can run on dreamhost.com hosting space.
# 4. The User-Agent can be changed to impersonate a search-engine spider.
# 5. The pause time can be set to control the collection speed.
# 6. Yahoo blocks the collecting IP address for several hours, so this collector is of limited practical use.
# (A short sketch for inspecting the collected database follows the script.)
# Author: Lukin <mylukin@gmail.com>
# Date: 2008-09-25

# Modules required for collection
import re, sys, time
import httplib, os.path as osp
from urlparse import urlparse
# The sqlite module has to be imported this way to stay compatible with dreamhost.com hosting.
try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

# Collection speed control, in seconds
sleep = 0
# Database path
dbname = './database.db'
# Headers sent with each request
headers = {"Accept": "*/*",
           "Referer": "http://answers.yahoo.com/",
           "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
# Connect to the server
dl = httplib.HTTPConnection('answers.yahoo.com')
# Connect to the database
conn = sqlite.connect(osp.abspath(dbname))

# Create the database
def createdatabase():
    global conn, dbname
    if osp.isfile(osp.abspath(dbname)): return
    c = conn.cursor()
    # Create a table for storing the URL list
    c.execute('''create table if not exists [collect] ([cid] integer primary key, [curl] text, [state] integer default '0', unique ([curl]));''')
    c.execute('''create index if not exists [collect_idx_state] on [collect] ([state]);''')
    # Create the category table
    c.execute('''create table if not exists [sorts] ([sortid] integer primary key, [sortname] text, [sortpath] text, [sortfoot] integer default '0', [sortnum] integer default '0', unique ([sortpath]));''')
    c.execute('''create index if not exists [sorts_idx_sortname] on [sorts] ([sortname]);''')
    c.execute('''create index if not exists [sorts_idx_sortfoot] on [sorts] ([sortfoot]);''')
    # Create the article table
    c.execute('''create table if not exists [article] ([aid] integer primary key, [sortid] integer default '0', [hits] integer default '0', [title] text, [path] text, [question] text, [banswer] text, [oanswer] text, unique ([path]));''')
    c.execute('''create index if not exists [article_idx_sortid] on [article] ([sortid]);''')
    # Commit the transaction
    conn.commit()
    c.close()

# Perform collection
def collect(url="http://answers.yahoo.com/"):
    global dl, headers
    r = 0
    print "Get:", url
    urls = urlparse(url); path = urls[2]
    if urls[4] != '': path += '?' + urls[4]
    dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
    if rs.status == 200:
        r = parsedata(rs.read(), url)
    else:
        # Drain the failed response so the connection can be reused, then retry.
        rs.read(); print "Retrying in 3 seconds..."; time.sleep(3)
        dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
        if rs.status == 200:
            r = parsedata(rs.read(), url)
        else:
            rs.read(); print "Retrying in 3 seconds..."; time.sleep(3)
            dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
            if rs.status == 200:
                r = parsedata(rs.read(), url)
            else:
                print "Giving up, continuing with the next URL..."
                r = 3
    # Update the record for this URL
    updateoneurl(url, r)
    # Return the result
    return r

# Process collected data
def parsedata(html, url):
    global dl, conn
    r = 2
    c = conn.cursor()
    # Normalise the HTML code
    format = formaturl(clearblank(html), url)
    # Extract all links
    urls = re.findall(r'''(<a [^>]*?href="([^"]+)"[^>]*?>)|(<a [^>]*?href='([^']+)'[^>]*?>)''', format, re.I)
    if urls != None:
        i = 0
        # Loop over all links
        for regs in urls:
            # Get a single URL
            surl = en2chr(regs[1].strip())
            # If the URL matches the rules, insert it into the database.
            if re.search('http(.*?)/(dir|question)/index(.*?)', surl, re.I) != None:
                if re.search('http(.*?)/dir/index(.*?)', surl, re.I) != None:
                    if surl.find('link=list') == -1 and surl.find('link=over') == -1:
                        surl += '&link=over'
                    else:
                        surl = surl.replace('link=list', 'link=over')
                if surl[-11:] == 'link=mailto': continue
                try:
                    c.execute('insert into [collect] ([curl]) values (?);', (surl,))
                    i = i + 1
                except sqlite.IntegrityError:
                    pass
        if i > 0: print "message: %d new URLs found." % (i,)
    # Extract the article data
    if re.search('http(.*)/question/index(.*)', url, re.I) != None:
        sortfoot = 0
        # Automatically create the category and the category relationship
        guide = sect(format, '<ol id="yan-breadcrumbs">', '</ol>', '(<li>(.*?)Home(.*?)</li>)')
        aguide = re.findall('<a [^>]*href="[^"]*"[^>]*>(.*?)</a>', guide, re.I)
        if aguide != None:
            sortname = ""
            for sortname in aguide:
                sortname = sortname.strip()
                sortpath = en2path(sortname)
                # Check whether the category already exists
                c.execute('select [sortid],[sortname] from [sorts] where [sortpath]=? limit 0,1;', (sortpath,))
                row = c.fetchone()
                # The category does not exist yet; add it
                if row == None:
                    c.execute('insert into [sorts] ([sortname],[sortpath],[sortfoot]) values (?,?,?);', (sortname, sortpath, sortfoot))
                    sortfoot = c.lastrowid
                else:
                    sortfoot = row[0]
        # Title
        title = sect(format, '<h1 class="subject">', '</h1>')
        # Best answer
        bestanswer = sect(format, '(<h2><span>Best Answer</span>(.*?)</h2>(.*?)<div class="content">)', '(</div>)')
        # If there is no best answer, the page is not collected.
        if bestanswer != None:
            # Article path
            path = en2path(sortname + '-' + title.strip())
            # Question
            adddata = sect(format, '<div class="additional-details">', '</div>')
            content = sect(format, '(<h1 class="subject">(.*?)<div class="content">)', '(</div>)')
            if adddata != None: content += '<br/>' + adddata
            # Other answers
            otheranswer = ''
            for regs in re.findall('<div class="qa-container">(.+?)<div class="utils-container">', format):
                if regs.find('<h2>') == -1 and regs.find('</h2>') == -1:
                    a1 = sect(regs, '<div class="content">', '</div>')
                    a2 = sect(regs, '<div class="reference">', '</div>')
                    otheranswer += '<div class="oanswer">' + a1
                    if a2 != None: otheranswer += '<div class="reference">' + a2 + '</div>'
                    otheranswer += '</div>'
            # Check whether the collection succeeded
            if title != None and content != None:
                # Write the article to the database
                try:
                    c.execute('insert into [article] ([sortid],[title],[path],[question],[banswer],[oanswer]) values (?,?,?,?,?,?);', (sortfoot, title, path, content, bestanswer, otheranswer))
                    print "message: %s.html" % (path,)
                    r = 1
                except sqlite.IntegrityError:
                    pass
    # Commit so the data is written to the database
    conn.commit(); c.close()
    return r
# Get one URL that has not been collected yet
def getoneurl():
    global conn
    c = conn.cursor()
    # State 0 is the default for URLs that have not been fetched yet
    c.execute('select [curl] from [collect] where [state] in (0) limit 1;')
    row = c.fetchone(); c.close()
    if row == None: return ""
    return row[0].encode('utf-8')

# Update the state of one record
def updateoneurl(url, state):
    global conn
    c = conn.cursor()
    c.execute('update [collect] set [state]=? where [curl]=?;', (state, url))
    conn.commit(); c.close()

# Remove unnecessary whitespace from the HTML code
def clearblank(html):
    if len(html) == 0: return ''
    html = re.sub('\r|\n|\t', '', html)
    while html.find("  ") != -1 or html.find('&nbsp;') != -1:
        html = html.replace('&nbsp;', ' ').replace('  ', ' ')
    return html

# Rewrite every link in the HTML to an absolute URL
def formaturl(html, url):
    urls = re.findall(r'''(<a [^>]*?href="([^"]+)"[^>]*?>)|(<a [^>]*?href='([^']+)'[^>]*?>)''', html, re.I)
    if urls == None: return html
    for regs in urls:
        html = html.replace(regs[0], matchurl(regs[0], url))
    return html

# Normalise a single URL found in a tag
def matchurl(tag, url):
    urls = re.findall(r'''(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)''', tag, re.I)
    if urls == None:
        return tag
    else:
        if urls[0][5] == '':
            urlquote = urls[0][2]
        else:
            urlquote = urls[0][5]

    if len(urlquote) > 0:
        curl = re.sub('''['"]''', '', urlquote)
    else:
        return tag

    urls = urlparse(url); scheme = urls[0]
    if scheme != '': scheme += '://'
    host = urls[1]; host = scheme + host
    if len(host) == 0: return tag
    path = osp.dirname(urls[2])
    if path == '/': path = ''
    if curl.find("#") != -1: curl = curl[:curl.find("#")]
    # Decide the URL type
    if re.search(r'''^(http|https|ftp):(//|\\\\)(([\w/\\\+\-~`@:%])+\.)+([\w/\\\.\=\?\+\-~`@':!%#]|(&amp;)|&)+''', curl, re.I) != None:
        # Already an absolute URL; leave the tag unchanged
        return tag
    elif curl[:1] == '/':
        # Root-relative path
        curl = host + curl
    elif curl[:3] == '../':
        # Relative path that climbs up the directory tree
        while curl[:3] == '../':
            curl = curl[3:]
            if len(path) > 0:
                path = osp.dirname(path)
        curl = host + path + '/' + curl
    elif curl[:2] == './':
        curl = host + path + curl[1:]
    elif curl.lower()[:7] == 'mailto:' or curl.lower()[:11] == 'javascript:':
        return tag
    else:
        curl = host + path + '/' + curl
    r = tag.replace(urlquote, '"' + curl + '"')
    return r

# HTML code truncation function
def sect(html, start, end, cls=''):
    if len(html) == 0: return
    # Regular expression truncation: start and end are both wrapped in parentheses
    if start[:1] == chr(40) and start[-1:] == chr(41) and end[:1] == chr(40) and end[-1:] == chr(41):
        rehtml = re.search(start + '(.*?)' + end, html, re.I)
        if rehtml == None: return
        rehtml = rehtml.group()
        intstart = re.search(start, rehtml, re.I).end()
        intend = re.search(end, rehtml, re.I).start()
        r = rehtml[intstart:intend]
    # Plain string truncation
    else:
        # Find the position of the start string
        intstart = html.lower().find(start.lower())
        # If the start string cannot be found, return None
        if intstart == -1: return
        # Find the position of the end string
        intend = html[intstart + len(start):].lower().find(end.lower())
        # If the end string cannot be found, return None
        if intend == -1: return
        # Both markers were found; cut the section out
        r = html[intstart + len(start):intstart + intend + len(start)]
    # Clean the content
    if cls != '':
        r = clear(r, cls)
    # Return the truncated section
    return r

# Regular expression cleaning
def clear(html, regexs):
    if regexs == '': return html
    for regex in regexs.split(chr(10)):
        regex = regex.strip()
        if regex != '':
            if regex[:1] == chr(40) and regex[-1:] == chr(41):
                html = re.sub(regex, '', html, flags=re.I | re.S)
            else:
                html = html.replace(regex, '')
    return html

# Format a string as a path slug
def en2path(enstr):
    return re.sub('[\W]+', '-', en2chr(enstr), flags=re.I | re.U).strip('-')

# Replace HTML entities with plain characters
def en2chr(enstr):
    return enstr.replace('&amp;', '&')

# ------------------------------------- Start of the main program -------------------------------------------

# First create the database
createdatabase()

# Start collecting
loops = 0
while True:
    if loops > 0:
        url = getoneurl()
        if url == "":
            loops = 0
        else:
            loops = collect(url)
    else:
        loops = collect()
    # Pause between requests
    time.sleep(sleep)
    if loops == 0: break
# Close the HTTP connection
dl.close()
# Exit the program
sys.exit()
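
Once the collector has finished, everything it gathered lives in the SQLite file configured in dbname. The following is a minimal inspection sketch, not part of the original script: it assumes the [collect], [sorts] and [article] tables created by createdatabase() above, and simply prints how many URLs are in each state plus the most recently stored articles.

import os.path as osp
try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

# Open the same database file the collector writes to.
conn = sqlite.connect(osp.abspath('./database.db'))
c = conn.cursor()

# URL queue overview: count the entries in [collect] per state value.
c.execute('select [state], count(*) from [collect] group by [state];')
for state, total in c.fetchall():
    print "state %s: %d URLs" % (state, total)

# The five most recently stored articles with their category names.
c.execute('''select a.[aid], s.[sortname], a.[title]
             from [article] a left join [sorts] s on a.[sortid] = s.[sortid]
             order by a.[aid] desc limit 5;''')
for aid, sortname, title in c.fetchall():
    print aid, sortname, title

c.close()
conn.close()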

 

This article references http://www.cnblogs.com/kuyuecs/archive/2008/10/15/1311346.html
