# coding: utf-8
"""Save each chapter of the Python tutorial on liaoxuefeng.com as a local HTML file.

The script downloads the tutorial index page, collects the chapter links,
downloads every chapter, cuts out the article body, wraps it in a local HTML
header and writes one numbered file per chapter.

NOTE(review): this file was recovered from a whitespace-mangled copy in which
several string literals were damaged.  The values of TITLE_SUFFIX,
CONTENT_START and CONTENT_END below must be confirmed against the live pages.
"""
import contextlib
import io
import os
import re

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

# Site root; chapter links found on the index page are relative to it.
DOMAIN = 'http://www.liaoxuefeng.com'
# Path of the tutorial's index page, which is also saved as chapter 0.
INDEX_PATH = '/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'
# Directory the generated HTML files are written to.
OUTPUT_DIR = r'c:\Users\cyhhao2013\Desktop\temp'
# Local file whose content is placed in front of every saved chapter.
HEADER_FILE = r'C:\Users\cyhhao2013\Desktop\0.html'

# Text that precedes every chapter link once newlines and spaces have been
# removed from the index page (hence "<ahref" without a space).
LINK_MARKER = 'em;"><ahref="'
# Marker after which the article body starts.
# NOTE(review): letter case may have been altered in the recovered copy.
CONTENT_START = '<!-- block main -->'
# Marker at which the article body ends.  The original literal was lost in the
# recovered copy; while this is None everything after CONTENT_START is kept.
# TODO(review): restore the real end-of-article marker.
CONTENT_END = None
# Site name appended to every page title.
# NOTE(review): looks machine-translated; confirm against a real <title>.
TITLE_SUFFIX = "-liao Xuefeng's official website"
# Closing tags appended after the article body.
# NOTE(review): the recovered copy was cut off after "</body>"; "</html>" is
# assumed so that the output is a complete document.
HTML_TAIL = '</body></html>'

# Characters that Windows does not allow in file names.
_FORBIDDEN_IN_FILENAME = re.compile(r'[\\/:*?"<>|]')
# A root-relative src attribute ("/x"), but not a protocol-relative one ("//x").
_ROOT_RELATIVE_SRC = re.compile(r'src="/(?!/)')


def fetch(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8; undecodable bytes are replaced rather than
    raising, so a single bad byte does not abort the whole run.
    """
    # closing() guarantees the connection is released, and also works on
    # Python 2 where the response object is not a context manager.
    with contextlib.closing(urlopen(url)) as response:
        return response.read().decode('utf-8', 'replace')


def extract_links(index_html, marker=LINK_MARKER, first_page=INDEX_PATH):
    """Return the chapter paths found in *index_html*, in page order.

    *first_page* is put at position 0 so the index page itself is saved too.
    Duplicated links are kept; callers number the results by position.
    """
    # Dropping line breaks and spaces makes every link start with the exact
    # same marker text, regardless of how the page was indented.
    compact = index_html.replace('\r', '').replace('\n', '').replace(' ', '')
    links = [first_page]
    for chunk in compact.split(marker)[1:]:
        # The link ends at the closing quote of the href attribute.
        links.append(chunk.split('">')[0])
    return links


def extract_title(page_html, site_suffix=TITLE_SUFFIX):
    """Return the text of the page's <title> without the site-name suffix.

    Returns an empty string when the page has no <title> element.  When the
    suffix is not present the whole title text is returned.
    """
    after_open = page_html.partition('<title>')[2]
    # Bound the title at </title> first so that a non-matching suffix can
    # never make the rest of the document part of the title.
    title = after_open.partition('</title>')[0]
    if site_suffix and site_suffix in title:
        title = title.split(site_suffix)[0]
    return title.strip()


def safe_filename(name):
    """Return *name* with the characters Windows forbids in file names removed."""
    return _FORBIDDEN_IN_FILENAME.sub('', name).strip()


def extract_body(page_html, start=CONTENT_START, end=CONTENT_END, domain=DOMAIN):
    """Return the article part of *page_html*, or None if *start* is missing.

    The article is the text after the first *start* marker, cut at the first
    *end* marker when *end* is given.  Root-relative ``src`` attributes are
    made absolute with *domain* so images still load from the saved file;
    absolute and protocol-relative URLs are left untouched.
    """
    _, found, content = page_html.partition(start)
    if not found:
        return None
    if end:
        content = content.split(end, 1)[0]
    # A function is used as the replacement so that *domain* is inserted
    # literally, without any backslash processing by the re module.
    return _ROOT_RELATIVE_SRC.sub(lambda match: 'src="' + domain + '/', content)


def main():
    """Download every chapter and write it to OUTPUT_DIR as '<n><title>.html'."""
    # NOTE(review): the header file is assumed to be UTF-8 encoded -- confirm.
    with io.open(HEADER_FILE, 'r', encoding='utf-8') as header_file:
        head = header_file.read()

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    index_html = fetch(DOMAIN + INDEX_PATH)

    # enumerate() gives every page its own number even when a link occurs
    # twice, so no output file overwrites another.
    for number, link in enumerate(extract_links(index_html)):
        url = DOMAIN + link
        print(url)
        page = fetch(url)

        body = extract_body(page)
        if body is None:
            print('skipped, content marker not found: ' + url)
            continue

        title = safe_filename(extract_title(page))
        target = os.path.join(OUTPUT_DIR, '%d%s.html' % (number, title))
        with io.open(target, 'w', encoding='utf-8') as output:
            output.write(head + body + HTML_TAIL)


if __name__ == '__main__':
    main()