Python implementation: log in to Zhihu, fetch your personal collections, and save them as a Word file

Source: Internet
Author: User
Tags urlencode
This program was actually finished quite a while ago but was never published; since I have not been very busy lately, I am sharing it with everyone.
It uses the BeautifulSoup and urllib2 modules, and the saving to Word is done with the Python docx module. Installation instructions are easy to find online, so I will not repeat them here.

The main function is to log in to Zhihu and save the questions and answers in your personal collections as a Word document, so that they can be consulted when there is no network connection. Pictures contained in the answers can also be fetched, although that part still has a few problems.

Also, the regular expressions are not used very well... shame on me...

There is also a known problem at the moment: all the answers to each question are saved. When I have time I will look into saving only the first answer, or only the answer shown on the collection page. Otherwise, if you have a lot of collections, the size of the saved Word file will scare you. O(∩_∩)O haha~

Logging in may require a verification code (captcha). If you are prompted to enter one, you will find the captcha image in the program's folder; just type in what it shows.

# -*- coding: utf-8 -*-
# Log in to Zhihu, fetch the personal collections, then save them as a Word file.
#
# NOTE(review): this listing was reconstructed from a published copy in which
# the code had been collapsed onto a few lines, identifier case had been
# altered, and the HTML-tag string literals had been stripped out.  Runtime
# strings marked "reconstructed" below should be checked against the
# author's original script.
import sys
reload(sys)
# Python 2 only: makes implicit str <-> unicode conversions use UTF-8.
sys.setdefaultencoding('utf-8')
import urllib
import urllib2
import cookielib
import string
import re
from bs4 import BeautifulSoup
from docx import Document
from docx import *
from docx.shared import Inches
from sys import exit
import os

# A SOCKS proxy is needed when running this from inside the company network.
# import socks
# import socket
# socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 8088)
# socket.socket = socks.socksocket

loginurl = 'http://www.zhihu.com/login'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36',
}

# Form fields posted to the login URL.  'email' and 'password' are filled in
# by the __main__ section; 'captcha' is added there when one is requested.
# NOTE(review): the _xsrf value is hard-coded in the published listing (case
# reconstructed as lowercase hex) -- presumably it has to match a token
# issued by the site; confirm.
postdata = {
    '_xsrf': 'acab9d276ea217226d9cc94a84a231f7',
    'email': '',
    'password': '',
    'rememberme': 'y',
}

# Start from a clean working directory: image folder present, no stale
# output document and no stale captcha image.
if not os.path.exists('myimg'):
    os.mkdir('myimg')
if os.path.exists('123.docx'):
    os.remove('123.docx')
if os.path.exists('checkcode.gif'):
    os.remove('checkcode.gif')

mydoc = Document()
questiontitle = ''

# Strips everything up to and including the last '/' of an image URL, leaving
# the bare file name.  NOTE(review): '\S' reconstructed; the published
# listing shows a lower-cased '\s'.
_IMG_NAME_RE = re.compile(r'http\S*/')


# ----------------------------------------------------------------------
def dealimg(imgcontent):
    """Add the images of an HTML fragment to the document and return its text.

    Every ``<img>`` whose ``src`` contains ``http`` is downloaded into the
    ``myimg`` folder and inserted into ``mydoc``.  The fragment is then
    returned as plain text with all markup removed.

    The published listing removed markup with a series of regular
    expressions and ``str.replace`` calls whose tag literals were lost; the
    parser is used instead.
    NOTE(review): the original also removed a "show all" expand link
    (``#show all`` comment); its pattern is lost, so that link's text is
    currently kept -- restore the selector from the original script.

    :param imgcontent: HTML fragment of one answer.
    :return: the fragment's text content, without tags.
    """
    soup = BeautifulSoup(imgcontent)
    for imglink in soup.findAll('img'):
        myimg = imglink.get('src')
        if not myimg or myimg.find('http') < 0:
            continue
        # Best effort, per image: one broken picture must not prevent the
        # remaining ones from being saved.
        try:
            imgsrc = urllib2.urlopen(myimg).read()
            imgname = _IMG_NAME_RE.sub('', myimg)
            imgpath = u'myimg' + '/' + imgname
            with open(imgpath, 'wb') as code:
                code.write(imgsrc)
            mydoc.add_picture(imgpath, width=Inches(1.25))
        except Exception as err:
            print 'skip image %s: %s' % (myimg, err)

    # Images are fetched before <noscript> is dropped, so pictures that only
    # appear inside a <noscript> fallback are still downloaded.
    for tag in soup.findAll('noscript'):
        tag.decompose()
    for tag in soup.findAll('img'):
        tag.decompose()
    return soup.get_text()


def enterquestionpage(pageurl):
    """Append one question page (title plus every answer) to the document.

    :param pageurl: absolute URL of the question page.
    """
    html = urllib2.urlopen(pageurl).read()
    soup = BeautifulSoup(html)
    # A page without <title> would otherwise raise AttributeError.
    if soup.title is not None and soup.title.string is not None:
        questiontitle = soup.title.string
    else:
        questiontitle = ''
    mydoc.add_heading(questiontitle, level=3)
    for div in soup.findAll(
            'div', {'class': 'fixed-summary zm-editable-content clearfix'}):
        # Keep line breaks: turn <br> into real newlines before the markup
        # is stripped.
        for br in div.findAll('br'):
            br.replace_with('\n')
        conent = dealimg(unicode(div))
        mydoc.add_paragraph(conent, style='BodyText3')


def entercollectpage(pageurl):
    """Visit every question linked from one collection page.

    :param pageurl: absolute URL of the collection page.
    """
    html = urllib2.urlopen(pageurl).read()
    soup = BeautifulSoup(html)
    for div in soup.findAll('div', {'class': 'zm-item'}):
        h2content = div.find('h2', {'class': 'zm-item-title'})
        if h2content is None:
            continue
        link = h2content.find('a')
        if link is None or not link.get('href'):
            continue
        quectionlink = 'http://www.zhihu.com' + link.get('href')
        enterquestionpage(quectionlink)
        print quectionlink


def loginzhihu():
    """Post the login form, then walk every collection of the account.

    Relies on the cookie-aware opener installed by the ``__main__`` section
    and on the module-level ``postdata``.  Prints ``login fail!`` and returns
    when the collections URL redirects to the login page or lists no items.
    """
    postdatastr = urllib.urlencode(postdata)
    # GET the login page first; presumably this lets the site set its
    # cookies before the form is posted -- TODO confirm.
    urllib2.urlopen(loginurl)
    request = urllib2.Request(loginurl, postdatastr, headers)
    response = urllib2.urlopen(request)
    response.read()

    collecturl = 'http://www.zhihu.com/collections'
    req = urllib2.urlopen(collecturl)
    if str(req.geturl()) == 'http://www.zhihu.com/?next=%2Fcollections':
        print 'login fail!'
        return
    txt = req.read()
    soup = BeautifulSoup(txt)
    divs = soup.findAll('div', {'class': 'zm-item'})
    # findAll returns a list, never None, so test for emptiness.
    if not divs:
        print 'login fail!'
        return
    print 'login ok!\n'
    for div in divs:
        link = div.find('a')
        if link is None or not link.get('href'):
            continue
        collectlink = 'http://www.zhihu.com' + link.get('href')
        entercollectpage(collectlink)
        print collectlink


def getcheckcode(thehtml):
    """Save the captcha image of a login response, if there is one.

    :param thehtml: HTML returned by the login request.
    :return: True when a captcha image was written to ``checkcode.gif``,
        False when the page contains no captcha.
    """
    soup = BeautifulSoup(thehtml)
    # NOTE(review): class name reconstructed from a garbled literal.
    div = soup.find('div', {'class': 'js-captcha captcha-wrap'})
    if div is None:
        return False
    imgsrc = div.find('img')
    if imgsrc is None:
        return False
    imglink = imgsrc.get('src')
    if imglink is None:
        return False
    imglink = 'http://www.zhihu.com' + imglink
    imgcontent = urllib2.urlopen(imglink).read()
    with open('checkcode.gif', 'wb') as code:
        code.write(imgcontent)
    return True


if __name__ == '__main__':
    import getpass

    # NOTE(review): prompt texts reconstructed from the garbled listing.
    username = raw_input('input username:')
    password = getpass.getpass('Enter password: ')
    postdata['email'] = username
    postdata['password'] = password
    postdatastr = urllib.urlencode(postdata)

    # Install a cookie-aware opener globally so that the session survives
    # across all later urllib2.urlopen calls.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    urllib2.urlopen(loginurl)
    request = urllib2.Request(loginurl, postdatastr, headers)
    response = urllib2.urlopen(request)
    txt = response.read()

    # If the first attempt was answered with a captcha, ask the user to read
    # checkcode.gif and include the answer in the next login post.
    if getcheckcode(txt):
        checkcode = raw_input('input checkcode:')
        postdata['captcha'] = checkcode
    loginzhihu()
    mydoc.save('123.docx')

    print 'the end'
    raw_input()

Well, that's about it. If you have any suggestions or comments, leave a message below and I will reply as soon as possible. Alternatively, my contact information is on the site's About page, so you can contact me directly.

  • Related Article

    Contact Us

    The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.