目标是 http://www.5442.com/meinv/

如需在非linux端使用请对代码中路径符号进行更改

捧上狗屎代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#-*- coding:utf-8 -*-
import re
import urllib
import urllib2
import os
import chardet
import sys
'''
def get_html(url):#正常获取网页内容
try:
request = urllib2.Request(url,headers=ua_headers)
response = urllib2.urlopen(request)
html = response.read()
return html
except:
print "获取内容失败"
'''
def get_html(url):#转码获取网页内容
try:
request = urllib2.Request(url,headers=ua_headers)
data = urllib2.urlopen(request).read()
typeEncode = sys.getfilesystemencoding()
infoencode = chardet.detect(data).get('encoding','gb2312')#根据网站编码来更改此行中的”gb2312“
html = data.decode(infoencode,'ignore').encode(typeEncode)
return html
except:
print "获取内容失败"

ua_headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
'Cookie': 'AspxAutoDetectCookieSupport=1'
}

url = "http://www.5442.com/meinv/"
########################################################
#先获取meinv页面内的所有tag链接。然后利用tag的名字创建相应的目录。
########################################################
tag_code = get_html(url)
tag_egrep = r'href="(.*).*" class="'
tag_url_list = re.findall(tag_egrep, tag_code)
print
print "[V]已成功爬去meinv页面内所有tag分类的链接"
print tag_url_list##打印meinv页面的所有链接
for tag_url in tag_url_list:
try:
tag_dir_name = tag_url[24:-5]
#tag_mkpath = "C:\\Users\\Administrator\\Desktop\\Python-learn\\Photos\\" + tag_dir_name
tag_mkpath = "Photos/" + tag_dir_name
tag_mkdir = os.path.exists(tag_mkpath)
print
print "...已成功匹配到该Tag的名称:" + tag_dir_name
if not tag_mkdir:
os.makedirs(tag_mkpath)
print "...创建%s目录成功----"%tag_dir_name
else:
print "...已有此%s目录----"%tag_dir_name
except:
print "...[X]获取%s链接失败或创建%s文件夹失败[X]"%tag_dir_name
##################################
#然后使用拿到的tag链接获取所有tz的链接。
##################################
try:
tz_code = get_html(tag_url)
tz_url_egrep = r'href="(.*).*" target="_blank" title="'
tz_url_list = re.findall(tz_url_egrep,tz_code)
print tz_url_list
for tz_url in tz_url_list:

print ".........当前帖子链接---"+tz_url
try:
xz_dir = tag_mkpath + ".html"
urllib.urlretrieve(tag_url,xz_dir)
#tz_name_egrep = r'_blank" title="(.*?)">'
tz_name_egrep = r"<img alt='(.*?)' src"
tz_name_list = re.findall(tz_name_egrep, tz_code)
print tz_name_list
t=0
###############################################
#然后使用拿到的tag链接获取所有tz的名字。并创建相应的目录
###############################################
for x_tz_name in tz_name_list:
print ".........已成功匹配到"+x_tz_name
tz_mkpath = tag_mkpath + "/" + x_tz_name
tz_mkdir = os.path.exists(tz_mkpath)
if not tz_mkdir:
os.makedirs(tz_mkpath)
print ".........创建%s目录成功"%x_tz_name
else:
print ".........已有%s此目录"%x_tz_name
###############################################
#然后使用拿到的tag链接获取所有tz的链接。并创建相应的目录
###############################################
xx = 0
while True :
try:
ttz_url = tz_url_list[t]#手动循环每个帖子
###########################
#添加每个帖子内的第N个页面的连接
###########################
if xx == 0:
tz_HQ_url = ttz_url
else:
tz_hz_url = ttz_url[-5:]
tz_qz_url = ttz_url[:-5]+"_"
tz_HQ_url = tz_qz_url + str(xx) + tz_hz_url
print "-------------------------------------------"+tz_HQ_url
#######################
#获取当前页面的所有图片连接
#######################
img_code = get_html(tz_HQ_url)
img_url_egrep = r"src='(.*).*' alt=''"
img_url_list = re.findall(img_url_egrep,img_code)
img = img_url_list[0]
try:
print "............已成功爬取到%s内所有图片的链接"% x_tz_name
print "............[所有图片的链接]"
print img_url_list
print "............%s中第%s个图片链接:%s"%(x_tz_name,xx,img)
img_name = tag_mkpath + "/"+x_tz_name +"/"+ img[-15:]
urllib.urlretrieve(img,img_name)#下载图片
print "...............已成功下载图片:"+img_name
print "========================================================="
print "========================================================="
print
except:
print "[X]下载图片出错!"
print "========================================================="
print "========================================================="
print
xx = xx + 1
except:
print "while false"
break
t=t+2
except:
print "爬取%s中图片链接失败!"%x_tz_name
##########################################################################
#判断当前循环执行后是否创建对应的文件夹,如果有则结束循环直接进行下一个tag标签页面的爬取
##########################################################################
if os.access(str(xz_dir), os.F_OK):
break
else:
pass
#渣渣代码不足为外人道也
except:
print "爬取%s中帖子失败"%tag_dir_name

捧上低帧GIF

1