目标是 http://www.5442.com/meinv/
如需在非linux端使用请对代码中路径符号进行更改
捧上狗屎代码
| import re import urllib import urllib2 import os import chardet import sys ''' def get_html(url):#正常获取网页内容 try: request = urllib2.Request(url,headers=ua_headers) response = urllib2.urlopen(request) html = response.read() return html except: print "获取内容失败" ''' def get_html(url): try: request = urllib2.Request(url,headers=ua_headers) data = urllib2.urlopen(request).read() typeEncode = sys.getfilesystemencoding() infoencode = chardet.detect(data).get('encoding','gb2312') html = data.decode(infoencode,'ignore').encode(typeEncode) return html except: print "获取内容失败" ua_headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36', 'Cookie': 'AspxAutoDetectCookieSupport=1' }
url = "http://www.5442.com/meinv/"
tag_code = get_html(url) tag_egrep = r'href="(.*).*" class="' tag_url_list = re.findall(tag_egrep, tag_code) print print "[V]已成功爬去meinv页面内所有tag分类的链接" print tag_url_list for tag_url in tag_url_list: try: tag_dir_name = tag_url[24:-5] tag_mkpath = "Photos/" + tag_dir_name tag_mkdir = os.path.exists(tag_mkpath) print print "...已成功匹配到该Tag的名称:" + tag_dir_name if not tag_mkdir: os.makedirs(tag_mkpath) print "...创建%s目录成功----"%tag_dir_name else: print "...已有此%s目录----"%tag_dir_name except: print "...[X]获取%s链接失败或创建%s文件夹失败[X]"%tag_dir_name try: tz_code = get_html(tag_url) tz_url_egrep = r'href="(.*).*" target="_blank" title="' tz_url_list = re.findall(tz_url_egrep,tz_code) print tz_url_list for tz_url in tz_url_list: print ".........当前帖子链接---"+tz_url try: xz_dir = tag_mkpath + ".html" urllib.urlretrieve(tag_url,xz_dir) tz_name_egrep = r"<img alt='(.*?)' src" tz_name_list = re.findall(tz_name_egrep, tz_code) print tz_name_list t=0 for x_tz_name in tz_name_list: print ".........已成功匹配到"+x_tz_name tz_mkpath = tag_mkpath + "/" + x_tz_name tz_mkdir = os.path.exists(tz_mkpath) if not tz_mkdir: os.makedirs(tz_mkpath) print ".........创建%s目录成功"%x_tz_name else: print ".........已有%s此目录"%x_tz_name xx = 0 while True : try: ttz_url = tz_url_list[t] if xx == 0: tz_HQ_url = ttz_url else: tz_hz_url = ttz_url[-5:] tz_qz_url = ttz_url[:-5]+"_" tz_HQ_url = tz_qz_url + str(xx) + tz_hz_url print "-------------------------------------------"+tz_HQ_url img_code = get_html(tz_HQ_url) img_url_egrep = r"src='(.*).*' alt=''" img_url_list = re.findall(img_url_egrep,img_code) img = img_url_list[0] try: print "............已成功爬取到%s内所有图片的链接"% x_tz_name print "............[所有图片的链接]" print img_url_list print "............%s中第%s个图片链接:%s"%(x_tz_name,xx,img) img_name = tag_mkpath + "/"+x_tz_name +"/"+ img[-15:] urllib.urlretrieve(img,img_name) print "...............已成功下载图片:"+img_name print "=========================================================" print "=========================================================" 
print except: print "[X]下载图片出错!" print "=========================================================" print "=========================================================" print xx = xx + 1 except: print "while false" break t=t+2 except: print "爬取%s中图片链接失败!"%x_tz_name if os.access(str(xz_dir), os.F_OK): break else: pass except: print "爬取%s中帖子失败"%tag_dir_name
捧上低帧GIF