#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import Queue
import random
import re
import sys
import threading
import time
import urllib
import urllib2
from sgmllib import SGMLParser

import MySQLdb
import requests
# Work queues shared by all threads: `queue` holds page URLs waiting to be
# fetched, `out_queue` holds fetched page bodies waiting to be parsed.
queue = Queue.Queue()
out_queue = Queue.Queue()
# Running counter used to name the image files written under picture/.
num = 0
class threadurl(threading.Thread):
    """Worker thread that downloads URLs taken from one queue into another.

    Each URL read from ``queue`` is fetched with ``requests``; the raw
    response body is put on ``out_queue``.  A URL whose download fails is
    reported and dropped (it is not retried).
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block this worker, and therefore ``queue.join()``, forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, queue, out_queue):
        """Store the input queue of URLs and the output queue of page bodies."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch URLs forever; intended to be run as a daemon thread."""
        while True:
            host = self.queue.get()
            print(host)
            try:
                response = requests.get(host, timeout=self._REQUEST_TIMEOUT)
                try:
                    # Place the page body into the out queue.
                    self.out_queue.put(response.content)
                finally:
                    response.close()
            except Exception as exc:
                # Thread boundary: report the failure, then back off briefly
                # before taking the next URL.
                print("error1: %s" % exc)
                time.sleep(5)
            finally:
                # Always signal completion so queue.join() cannot hang.
                self.queue.task_done()
class dataminethread(threading.Thread):
    """Worker thread that mines fetched app pages taken from ``out_queue``.

    For every page body it extracts the app metadata with regular
    expressions, inserts one row per introduction match into the ``address``
    table, and downloads every matched image into ``picture/``.

    NOTE(review): this file reached review with whitespace and letter case
    mangled, and two patterns (introduction, image) cut short.  The patterns
    below are a best-effort reconstruction, and the English field labels look
    machine-translated -- confirm every pattern against the live page markup.
    """

    _NAME_RE = re.compile(r'<div class="appdiscrib">[\s]*?<h4>(.+?)</h4>')
    _VERSION_RE = re.compile(r'version number(.+?)</li>')
    _DEVELOPER_RE = re.compile(r'initiator(.+?)</li>')
    _PUBTIME_RE = re.compile(r'release time(.+?)</li>')
    _FILESIZE_RE = re.compile(r'file size(.+?)</li>')
    _SUPPORT_RE = re.compile(r'supports firmware(.+?)</li>')
    # TODO(review): the tail of this pattern was lost; the closing tag is a guess.
    _INTRO_RE = re.compile(r'application introduction(.+?)</div>')
    # TODO(review): the capture group of this pattern was lost.  It is assumed
    # to capture a quoted src value, because the caller strips one leading and
    # two trailing characters from each match.
    _IMAGE_RE = re.compile(
        r'<div class="apptitle clearfix">[\s]*?<img src=(.+?)/>')

    # One placeholder per listed column (seven).
    _INSERT_SQL = ("insert into address (name, version, developer, pubtime, "
                   "filesize, support, introduction) "
                   "values (%s, %s, %s, %s, %s, %s, %s)")

    # Seconds to wait for an image download before giving up.
    _REQUEST_TIMEOUT = 30

    # Serialises updates of the module-level ``num`` counter, which all
    # dataminethread instances share to name image files.
    _num_lock = threading.Lock()

    def __init__(self, out_queue):
        """Store the queue of fetched page bodies to consume."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Process page bodies forever; intended to be run as a daemon thread."""
        while True:
            result = self.out_queue.get()
            try:
                self._store_rows(self._build_rows(result))
                self._save_images(result)
            except Exception as exc:
                # Thread boundary: one bad page must not kill the worker.
                print("dataminethread error: %s" % exc)
            finally:
                # Always signal completion so out_queue.join() cannot hang.
                self.out_queue.task_done()

    def _build_rows(self, page):
        """Return the list of value tuples to insert for ``page``."""
        intros = self._INTRO_RE.findall(page)
        if not intros:
            return []
        required = [
            pattern.findall(page)
            for pattern in (self._NAME_RE, self._VERSION_RE,
                            self._DEVELOPER_RE, self._PUBTIME_RE,
                            self._FILESIZE_RE)
        ]
        if not all(required):
            # Indexing an empty match list used to raise IndexError and
            # terminate the thread; skip the record instead.
            print("error1: page is missing a required field; record skipped")
            return []
        head = tuple(found[0] for found in required)
        support = self._SUPPORT_RE.findall(page)
        support_value = support[0] if support else 'null'
        return [head + (support_value, re.sub('<br/>', '', intro))
                for intro in intros]

    def _store_rows(self, rows):
        """Insert ``rows`` into the database, committing after each row."""
        if not rows:
            return
        conn = None
        try:
            conn = MySQLdb.connect(host='localhost', user='root',
                                   passwd='000000', db='ssssbookdb',
                                   charset="utf8")
            cursor = conn.cursor()
            try:
                for values in rows:
                    # Values are passed separately so the driver escapes them.
                    cursor.execute(self._INSERT_SQL, values)
                    conn.commit()
            finally:
                cursor.close()
        except Exception as exc:
            print("error2: %s" % exc)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as exc:
                    print("error3: %s" % exc)

    def _save_images(self, page):
        """Download every image matched in ``page`` into ``picture/``."""
        global num
        for raw in self._IMAGE_RE.findall(page):
            print(raw)
            try:
                # Drop the first character and the last two of the match.
                response = requests.get(raw[1:-2],
                                        timeout=self._REQUEST_TIMEOUT)
                with self._num_lock:
                    index = num
                    num = num + 1
                print(index + 1)
                # Binary mode: image bytes must be written unmodified.
                with open("picture/" + str(index), "wb") as handle:
                    handle.write(response.content)
            except Exception as exc:
                print("error4: %s" % exc)
def main():
    """Crawl the app listing pages and feed each app page to the workers.

    Starts 20 ``threadurl`` fetchers and 20 ``dataminethread`` parsers once,
    then walks listing pages 1..2538, queues every app detail URL found on
    them, and finally waits for both queues to drain.
    """
    # Start the worker pools once.  The workers loop forever, so creating a
    # fresh pool for every listing page would pile up thousands of threads.
    for _ in range(20):
        fetcher = threadurl(queue, out_queue)
        fetcher.daemon = True
        fetcher.start()
    for _ in range(20):
        miner = dataminethread(out_queue)
        miner.daemon = True
        miner.start()

    # NOTE(review): reconstructed from a garbled literal; it is assumed to
    # capture the site-relative path of an app page -- confirm against the
    # listing markup.
    link_re = re.compile(r'<a href="(/Product/App\d{1,8}\.html)"')

    for k in range(1, 2539):
        print(k)
        url = "http://apk.gfan.com/apps_7_1_" + str(k) + ".html"
        try:
            response = requests.get(url, timeout=30)
            try:
                page = response.content
            finally:
                response.close()
        except requests.RequestException as exc:
            # Report the failed listing page, back off, and move on.
            print("error0: %s" % exc)
            time.sleep(5)
            continue
        # De-duplicate links before queueing them.
        for path in set(link_re.findall(page)):
            queue.put("http://apk.gfan.com" + path)

    # Wait for all fetches, then for all parsing, to finish.
    queue.join()
    out_queue.join()
# sql = "select * from address"
# cursor.execute(sql)
# conn.commit()
# finalresult = cursor.fetchall()
# if finalresult:
#     for x in finalresult:
#         pass  # print x[0:]
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()