
','')
mydoc.add_paragraph(conent,style='BodyText3')
"""file=open('222.txt','a')
file.write(str(conent))
file.close()"""
def entercollectpage(pageurl):
    """Follow every question linked from one collection page.

    Downloads ``pageurl``, looks at each ``div`` with class ``zm-item`` and,
    for every one that has an ``h2`` with class ``zm-item-title`` containing
    a link, builds the absolute question URL, hands it to
    ``enterquestionpage`` (defined elsewhere in this file) and prints it.

    Args:
        pageurl: Absolute URL of the collection page to fetch.
    """
    html = urllib2.urlopen(pageurl).read()
    soup = BeautifulSoup(html)
    for div in soup.findAll('div', {'class': 'zm-item'}):
        h2content = div.find('h2', {'class': 'zm-item-title'})
        if h2content is None:
            continue
        link = h2content.find('a')
        # A title without an anchor, or an anchor without an href, cannot be
        # followed; skip it instead of crashing on None.
        mylink = link.get('href') if link is not None else None
        if mylink is None:
            continue
        quectionlink = 'http://www.zhihu.com' + mylink
        enterquestionpage(quectionlink)
        print(quectionlink)
def loginzhihu():
    """Log in with the module-level ``postdata`` and walk every collection.

    Posts the url-encoded ``postdata`` to ``loginurl`` (sending ``headers``),
    then requests the collections page. If that request ends up at the
    ``?next=%2Fcollections`` URL, ``'login fail!'`` is printed and the
    function returns. Otherwise every ``div`` with class ``zm-item`` on the
    page is followed through ``entercollectpage`` and its URL is printed.

    Relies on the module-level names ``postdata``, ``loginurl`` and
    ``headers``. The ``__main__`` section installs a cookie-processing
    opener before calling this function.
    """
    postdatastr = urllib.urlencode(postdata)
    # The login page is requested once before posting, exactly as before
    # (presumably to pick up session cookies -- TODO confirm).
    urllib2.urlopen(loginurl)
    request = urllib2.Request(loginurl, postdatastr, headers)
    response = urllib2.urlopen(request)
    # The body is read and discarded, as the original did.
    response.read()

    collecturl = 'http://www.zhihu.com/collections'
    req = urllib2.urlopen(collecturl)
    if str(req.geturl()) == 'http://www.zhihu.com/?next=%2Fcollections':
        print('login fail!')
        return

    soup = BeautifulSoup(req.read())
    # findAll returns a (possibly empty) list, never None, so the former
    # "divs is None" check could not fire; an empty list just means there is
    # nothing to iterate over below.
    divs = soup.findAll('div', {'class': 'zm-item'})
    print('login ok!\n')
    for div in divs:
        link = div.find('a')
        # Skip entries that have no anchor or no href rather than crashing.
        mylink = link.get('href') if link is not None else None
        if mylink is None:
            continue
        collectlink = 'http://www.zhihu.com' + mylink
        entercollectpage(collectlink)
        print(collectlink)
        # For a quick test run, return here after the first collection.
def getcheckcode(thehtml):
    """Save the captcha image referenced by a login page, if there is one.

    Searches ``thehtml`` for a ``div`` whose class attribute is
    ``'js-captcha captcha-wrap'``. When that div contains an ``img`` with a
    ``src`` attribute, the image is downloaded from ``http://www.zhihu.com``
    plus that ``src`` and written to ``checkcode.gif`` in the current
    directory.

    Args:
        thehtml: HTML text of the page to inspect.

    Returns:
        True if a captcha image was downloaded and saved, otherwise False.
    """
    soup = BeautifulSoup(thehtml)
    div = soup.find('div', {'class': 'js-captcha captcha-wrap'})
    if div is None:
        return False
    imgsrc = div.find('img')
    # A captcha wrapper without an <img> used to raise AttributeError here.
    if imgsrc is None:
        return False
    imglink = imgsrc.get('src')
    if imglink is None:
        return False
    imgcontent = urllib2.urlopen('http://www.zhihu.com' + imglink).read()
    with open('checkcode.gif', 'wb') as code:
        code.write(imgcontent)
    return True
if __name__ == '__main__':
    import getpass

    # Ask for the credentials and store them in the shared login form data.
    username = raw_input('input username:')
    password = getpass.getpass('Enter password: ')
    postdata['email'] = username
    postdata['password'] = password
    postdatastr = urllib.urlencode(postdata)

    # Install a cookie-processing opener globally so that every later
    # urllib2.urlopen call goes through it.
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # First attempt: fetch the login page, post the form, and keep the reply
    # so it can be checked for a captcha.
    urllib2.urlopen(loginurl)
    txt = urllib2.urlopen(
        urllib2.Request(loginurl, postdatastr, headers)).read()

    # getcheckcode saves the captcha image to disk when one is present; the
    # user then types in the code, which is added to the form data.
    if getcheckcode(txt):
        postdata['captcha'] = raw_input('input checkcode:')

    # Both paths log in and then save the collected document.
    loginzhihu()
    mydoc.save('123.docx')

    print('the end')
    # Wait for Enter before exiting.
    raw_input()
好了，大概就是这样。大家如果有什么好的建议或者其他想法，可以在下面留言，我会尽快回复的；或者在小站的“关于”页面有我的联系方式，直接联系我就 OK。