# Python 2.7: read HTML files, extract the specified elements, and write them to an Excel file
#coding =GBK
import codecs
import logging
import os
import string
import sys
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook
class Logmsg:
    """Thin wrapper that attaches a file handler to the root logger.

    The wrapper configures the root logger (``logging.getLogger()`` with no
    name), so the level chosen here applies process-wide.

    Attributes:
        logger: the root ``logging.Logger``.
        hdlr: the ``logging.FileHandler`` writing to ``logfile``.
    """

    # Numeric levels accepted by __init__/output; anything else falls back.
    _LEVELS = (
        logging.DEBUG,     # 10
        logging.INFO,      # 20
        logging.WARNING,   # 30
        logging.ERROR,     # 40
        logging.CRITICAL,  # 50
    )

    def __init__(self, logfile, level=0):
        """Open ``logfile`` for appending and register it on the root logger.

        Args:
            logfile: path of the log file to write to.
            level: one of 10/20/30/40/50; any other value selects NOTSET.

        On any failure prints "Log init error!" and exits with status 1.
        """
        try:
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]:%(message)s", "%Y%m%d%H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if level in self._LEVELS:
                self.logger.setLevel(level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # are not swallowed.
            print("Log init error!")
            sys.exit(1)

    def output(self, loginfo):
        """Log ``loginfo`` at the logger's current effective level.

        Emitting at the effective level guarantees the record passes the
        logger's own threshold; an unrecognised level is logged as INFO.

        On any failure prints "Log Output error!" and exits with status 1.
        """
        level = self.logger.getEffectiveLevel()
        try:
            if level not in self._LEVELS:
                level = logging.INFO
            self.logger.log(level, loginfo)
        except Exception:
            print("Log Output error!")
            sys.exit(1)

    def close(self):
        """Detach the file handler from the root logger and close its file.

        On any failure prints "Log closed error!" and exits with status 1.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() alone leaves the underlying file open.
            self.hdlr.close()
        except Exception:
            print("Log closed error!")
            sys.exit(1)
# ---------------------------------------------------------------------------
# Run configuration (evaluated at import time; opens the log file).
# ---------------------------------------------------------------------------
_now = time.localtime()
logtime = time.strftime("%Y%m%d%H%M%S", _now)   # timestamp used in the .xls name
logfiletime = time.strftime("%Y%m%d", _now)     # one log file per day
logfile = '/data/pyexample/logs/htmlparser_%s.log' % logfiletime
log = Logmsg(logfile, 20)
datapath = '/data/pyexample/'
xlsname = 'dangjian_' + logtime + '.xls'

# ---------------------------------------------------------------------------
# Script body: for every .html file in `datapath`, collect the text of all
# <td> cells, pick fixed cell positions out of that list, and append one row
# per file to a single-sheet workbook saved as `datapath + xlsname`.
# Targets Python 2.7; print is written in the single-argument parenthesised
# form, which behaves the same there.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    wbk = xlwt.Workbook(encoding='GBK')
    sheet = wbk.add_sheet('Basic Content import template')

    # Header row (row 0).
    headers = (
        'content type',
        'column name',
        'column number',
        'content name',
        'time Long',
        'key word',
        'watch',
        'author',
        'source',
        'sub content 1',
        'sub content 2',
    )
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    # Highest cell index read below; shorter files cannot be mapped to a row.
    last_cell = 36

    k = 0  # number of data rows written so far
    for f in os.listdir(datapath):
        if os.path.splitext(f)[1] != '.html':
            continue

        content = []
        log.output('Current file: ' + f)

        # `with` guarantees the handle is closed even if reading raises.
        with codecs.open(os.path.join(datapath, f), 'r', 'GBK') as htmlfile:
            lines = htmlfile.readlines()
        if not lines:
            log.output('not line')

        for line in lines:
            # strip() removes the newline too, so test for emptiness rather
            # than comparing against '\n' (which could never match).
            if not line.strip():
                log.output('The place is empty line')
                continue
            # NOTE(review): the replace() arguments were unreadable in the
            # source; dropping the '&nbsp;' entity is an assumption - confirm.
            line = line.replace('&nbsp;', '')
            soup = BeautifulSoup(line, 'html.parser')
            for tdd in soup.find_all('td'):
                content.append(tdd.text.encode("GBK"))

        # Dump every cell with its position; enumerate() gives the real
        # index even when two cells hold identical text.
        for idx, item in enumerate(content):
            print('%d , %s' % (idx, item))
            log.output(item)
            log.output(idx)
        print('----------------------------------------')

        if len(content) <= last_cell:
            # Keep processing the remaining files instead of crashing with
            # IndexError and losing the rows collected so far.
            log.output('Skip file, too few cells (%d): %s' % (len(content), f))
            continue

        foldername = content[6]
        contentname = content[4]
        digits = ''.join(ch for ch in content[16] if ch.isdigit())
        if not digits:
            log.output('Skip file, no numeric duration: ' + f)
            continue
        # Cell value is multiplied by 60 before being written
        # (presumably minutes -> seconds; verify against the HTML source).
        str_duration = "%i" % (int(digits) * 60)
        keyword = content[6]
        desciption = content[36]
        videoname_1 = content[10]

        for value in (foldername, contentname, str_duration,
                      keyword, desciption, videoname_1):
            print(value)

        log.output('Output XLS data: ' + ',' + foldername + ',,' + contentname
                   + ',' + str_duration + ',' + keyword + ',' + desciption
                   + ',admin,number editor,' + videoname_1 + ',,,')
        print(k)

        # One spreadsheet row per file, columns aligned with `headers`.
        row = (
            "",
            foldername,
            "",
            contentname,
            str_duration,
            keyword,
            desciption,
            'admin',
            'Hua Number editor',
            videoname_1,
            "",
        )
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    # NOTE(review): original indentation was lost; saving once after all
    # files are processed yields the same final workbook - confirm placement.
    wbk.save(os.path.join(datapath, xlsname))
    print('=========================================')