最新文章专题视频专题问答1问答10问答100问答1000问答2000关键字专题1关键字专题50关键字专题500关键字专题1500TAG最新视频文章推荐1 推荐3 推荐5 推荐7 推荐9 推荐11 推荐13 推荐15 推荐17 推荐19 推荐21 推荐23 推荐25 推荐27 推荐29 推荐31 推荐33 推荐35 推荐37视频文章20视频文章30视频文章40视频文章50视频文章60 视频文章70视频文章80视频文章90视频文章100视频文章120视频文章140 视频2关键字专题关键字专题tag2tag3文章专题文章专题2文章索引1文章索引2文章索引3文章索引4文章索引5123456789101112131415文章专题3
当前位置: 首页 - 科技 - 知识百科 - 正文

python实现爬虫下载漫画示例

来源:动视网 责编:小采 时间:2020-11-27 14:38:45
文档

python实现爬虫下载漫画示例

python实现爬虫下载漫画示例:代码如下:#!/usr/bin/python3.2import os,socketimport urllibimport urllib.request,threading,timeimport re,sysglobal manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mu
推荐度:
导读python实现爬虫下载漫画示例:代码如下:#!/usr/bin/python3.2import os,socketimport urllibimport urllib.request,threading,timeimport re,sysglobal manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mu


代码如下:


#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2

weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6


if len(sys.argv)>=3:
weburl=sys.argv[1]
floder=sys.argv[2]
else:
print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
sys.exit(0)
if len(sys.argv)>=4:
chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
threadcount=(int)(sys.argv[4])

def jin(i,jinzhi):
finalans=""
answer=i%jinzhi
i=int(i/jinzhi)
if answer>9:
finalans=finalans+chr(ord('a')+(answer-10))
else:
finalans=finalans+str(answer)
if i!=0:
finalans=jin(i,jinzhi)+finalans
return finalans
def urlparse(p,a,c,k):
d={}
e=lambda c: jin(c,36)
if 1:
while c:
c=c-1
if not k[c]:
d[jin(c,36)]=jin(c,36)
else:
d[jin(c,36)]=k[c]
k=[lambda e:d[e]]
e=lambda c:'\\w+'
c=1
newstr=""
while c:
c=c-1
if k[c]:
for i in range(0,len(p)):
tempi=p[i]
tempi=ord(tempi)
if tempi>=ord('a') and tempi<=ord('f'):
newstr+=d[chr(tempi)]
elif tempi>=ord('0') and tempi<=ord('9'):
newstr+=d[chr(tempi)]
else:
newstr+=chr(tempi)
return newstr
def meispower(s):
p=re.compile(r"(?=\}\().*",re.IGNORECASE)
s=p.findall(s)
s=s[0]
s=s[0:(len(s)-19)]
par=s.split(',')
par[3]=par[3][1:len(par[3])]
answer=par[3].split('|')
chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
allurl=allurl[10:(len(allurl)-2)]
return allurl
def pictofile(weburl,filename,loop=100):
if loop<0:
print('can\'t download the picture %s'%weburl)
return
loop=loop-1
if os.path.exists(filename):
return
try:
url=urllib.request.urlopen(weburl)
data=url.read()
if len(data)<2048:
url.close()
pictofile(weburl,filename,loop)
else:
print('download from %s name is %s\n'%(weburl,filename))
myfile=open('%s'%filename,'wb')
myfile.write(data)
myfile.close()
url.close();
except socket.timeout:
print('timeout')
pictofile(weburl,filename,loop)
except Exception as e:
print('error',e)
pictofile(weburl,filename,loop)
finally:
pass
def downloadpic(url,loadpicdir,num):
#download the all url picture to loadpicdir
global currentthreadnum,mutex,mutex2
mymode=re.compile(r'[0-9a-z.]*\Z')
try:
mutex2.acquire()
os.chdir(loadpicdir)
mutex2.release()
except:
print("can't open the floder %s will be create"%loadpicdir)
try:
if(mutex2.locked()):
os.mkdir(loadpicdir)
os.chdir(loadpicdir)
mutex2.release()
print('create floder succeed')
except:
print("can't create floder %s"%loadpicdir)
if(mutex.acquire()):
mutex.release()
quit(0)
name=mymode.findall(url)
filename='manhua'+name[0]
pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
mutex.acquire()
currentthreadnum=currentthreadnum-1
mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
global manhuaweb,threadcount,currentthreadnum,mutex
print(manhuaweb+url)
webdata=urllib.request.urlopen(manhuaweb+url).read()
webdata=webdata.decode('UTF-8')
chaptername=re.findall(r'[^_]*',webdata)[0]<BR> chaptername=chaptername[7:len(chaptername)]<BR> webscrip=re.findall(r'eval.*[^<>]',webdata)<BR> chapterurl=meispower(webscrip[0]);<BR> chapterurl='http://mhimg.ali213.net'+chapterurl<BR> for i in range(begin,num):<BR> try:<BR> while(currentthreadnum>=threadcount):<BR> time.sleep(0.5)<BR> mutex.acquire()<BR> currentthreadnum=currentthreadnum+1<BR> mutex.release()<BR> threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()<BR> except socket.error:<BR> mutex.acquire()<BR> i=i-1<BR> currentthreadnum=currentthreadnum-1<BR> mutex.release()<BR> except Exception as error:<BR> print(error,'break')<BR> print('download chapter %d of picture make a error'%i)<BR> break<BR>if __name__=='__main__':<BR> manhuaweb=r'http://manhua.ali213.net'<BR> socket.setdefaulttimeout(60.0)<BR> mutex=threading.Lock()<BR> mutex2=threading.Lock()</P> <P> <BR> webfile=urllib.request.urlopen(weburl)<BR> webdata=webfile.read();<BR> webdata=webdata.decode('UTF-8')<BR> meshmode=re.compile(r'.*')<BR> meshdata=meshmode.findall(webdata)[0]<BR> indexmode=re.compile(r'([0-9]*页)')<BR> indexdata=indexmode.findall(meshdata)</P> <P> picurlmode=re.compile(r'/comic/[0-9/]*.html')<BR> picurldata=picurlmode.findall(meshdata)</P> <P><BR> chapterlength=len(picurldata)<BR> nummode=re.compile(r'[\d]+')</P> <P> i=chapterbegin<BR> while i<chapterlength:<BR> manhuachapter=picurldata[chapterlength-i-1]<BR> downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))<BR> i=i+1 </p><p><script type="text/javascript" src="https://jss.51dongshi.net/pcwz/dysph.js"></script></div> <div class="downbox clearfix"> <div class="ico"><img src="https://js.51dongshi.net/tpl/pc2/images/document.png" alt="文档"></div> <div class="txt"> <h4>python实现爬虫下载漫画示例</h4> <div class="co">python实现爬虫下载漫画示例:代码如下:#!/usr/bin/python3.2import os,socketimport urllibimport urllib.request,threading,timeimport re,sysglobal manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mu</div> <div class="tj"><b>推荐度:</b><div class="tj_stars_list"><i class="tj_stars tj_05"></i></div></div> </div> <div class="btn"> <a href="javascript:;" class="html2word" data-model="article" data-id="cadbah" title="文档下载"> <b>点击下载本文</b> <span>文档为doc格式</span> </a> </div> </div> <script>keji_detail_ga('article_content_bottom');</script> <div class="tvideo_tag"> 标签: <a href="/tlist-10872/">漫画</a> <a href="/tlist-136697/">manhua</a> <a href="/tlist-283385/">示例</a> </div> <div class="tvideo_box tvideo_box_tab"> <div class="hd"> <ul><li class="on">热门焦点</li><script>keji_detail_ga('hot_jiaodian_tab_title');</script></ul> </div> <div class="bd"> <ul class="ult_jpic clearfix"> <script>get_hot_jiaodian_content('wz_hot_jiaodian',33,'hot',1);</script> </ul> </div> </div> </div> <!--中间文章 end--> <!--右侧专题 star--> <div class="ult_zt"> <script>keji_detail_ga('article_right_top');</script> <div class="ult_ztbox"> <h4>最新推荐</h4> <ul class="tvideo_r_a clearfix"> <script>get_detail_right('wz_right_new',33,'')</script> </ul> </div> <script>keji_detail_ga('article_right_middle');</script> <div class="ult_ztbox"> <h4>猜你喜欢</h4> <ul class="telist_rb clearfix"> <script>get_detail_right('wz_right_love',33,'')</script> </ul> </div> <script>keji_detail_ga('article_right_new_bottom');</script> <div class="ult_ztbox"> <h4>热门推荐</h4> <ul class="tvideo_r_b clearfix"> <script>get_detail_right('wz_right_hot',33,'')</script> </ul> </div> <script>keji_detail_ga('article_right_bottom');</script> </div> <!--右侧专题 end--> <div style="display:none"><a href="https://www.51dongshi.net/wzztf/ca/cadbah/">专题</a> </div> </div> </div> <script src="https://js.51dongshi.net/plug/qrcode/qrcode.min.js"></script> <script src="https://js.51dongshi.net/js/share.js" charset="UTF-8"></script> <script type="text/javascript" src="https://js.51dongshi.net/tpl/pc2/js/waypoints.min.js"></script> <script type="text/javascript" src="https://js.51dongshi.net/tpl/pc2/js/icon_step.js"></script> <style> #qrcode{padding: 15px;background: #fff;} </style> <div id="qrcode" style="display: none"></div> <script>new QRCode(document.getElementById("qrcode"), "https://m.51dongshi.net/eedfcadbah.html");</script> <iframe src="https://hits.51dongshi.net/?biao=ho_article&id=892695" border="0" frameborder="0" style="width: 0px; height: 0px"></iframe> <script src="https://js.51dongshi.net/js/shHighlighter.js"></script> <link rel="stylesheet" href="https://jss.51dongshi.net/js/fffz/css/box.css"> <script id="fffz" data-domain="//www.51dongshi.net/index" src="https://js.51dongshi.net/js/fffz/fffz.js"></script> <script type="text/javascript" src="https://jss.51dongshi.net/pcwz/tj.js"></script> <div style="display:none"><script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?4b1ebb0298b66c8a109db070c4878833"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> </div> <!--foot star--> <div class="footer"> <div class="m_box"> <div class="footer_in"> <a href="#" target="_blank">产品服务</a> <a href="#" target="_blank">发展历程</a> <a href="#" target="_blank">企业资讯</a> <a href="#" target="_blank">企业文化</a> <a href="#" target="_blank">关于我们</a> <a href="#" target="_blank">加入我们</a> <a href="#" target="_blank">联系我们</a> <a href="#" target="_blank">网站导航</a> <a href="#" target="_blank">网站律师</a> </div> <ul class="ftrlist"> <li> <a rel="nofollow" href="https://www.itrust.org.cn" target="_blank"> <img src="https://js.51dongshi.net/tpl/pc2/images/footer_logo01.gif" alt="中国互联网协会"></a> </li> <li> <a rel="nofollow" href="https://www.12377.cn/" target="_blank"> <img src="https://js.51dongshi.net/tpl/pc2/images/12377logo.png" alt="中国互联网举报中心"></a> </li> <li> <a rel="nofollow" href="http://www.cyberpolice.cn/" target="_blank"> <img src="https://js.51dongshi.net/tpl/pc2/images/footer_logo05.gif" alt="网络110报警服务"></a> </li> <li> <a rel="nofollow" href="http://www.creditchina.gov.cn/" target="_blank"> <img src="https://js.51dongshi.net/tpl/pc2/images/creditchina.gif" alt="信用中国"></a> </li> <li class="last"> <a rel="nofollow" href="http://www.shdf.gov.cn/shdf/channels/740.html" target="_blank"> <img src="https://js.51dongshi.net/tpl/pc2/images/footer_logo11.png" alt="中国扫黄打非网"><p>中国扫黄打非网</p> </a> </li> </ul> <div class="footer_co"> <a href="/" class="footlogo"><img src="https://js.51dongshi.net/tpl/pc2/images/logo.png" alt="动视"></a> <p>Copyright © 2019-2025 <a href="/" target="_blank">51dongshi.net</a> 版权所有</p> <p> <a rel="nofollow" target="_blank" href="https://beian.miit.gov.cn/">赣ICP备2023002352号-34</a> </p> <p>违法及侵权请联系:TEL:177 7030 7066 E-MAIL:11247931@qq.com 本站由北京市万商天勤律师事务所王兴未律师提供法律服务</p> </div> </div> </div> <!--foot end--> <a href="#0" class="cd-top">Top</a> <script type="text/javascript"> $(function(){ var _line=parseInt($(window).height()/3); $(window).scroll(function(){ if ($(window).scrollTop()>100) { $('.edu_top').css({'position':'fixed','top':'0','z-index':'99'}) }else{ $('.edu_top').css({'position':'relative','top':'0'}) } }) }) </script> <script type="text/javascript" src="https://jss.51dongshi.net/ga/all.js"></script> </body> </html>