First time touching Python and web scraping, and this is what I came up with. It's pretty inefficient (and I don't know why, but it fetches the same content more than once, and some downloads fail).
Hoping the experts here can point out how to improve it.
For learning, discussion and fun only~
import requests
import os
import time
import re

def send_requests(url_req):  # fetch a page and return its HTML text
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    response = requests.get(url=url_req, headers=headers)
    response.encoding = 'utf-8'
    text = response.text  # read the body before closing the connection
    response.close()
    return text
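# --- Side note on the failed downloads: part of it may be that requests.get
# has no timeout here, so a stalled connection just hangs, and transient
# errors are never retried. A minimal sketch of a more robust fetch using a
# shared Session with automatic retries (send_requests_robust and the retry
# numbers are example choices, not part of the original script):
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))

def send_requests_robust(url_req):
    # timeout stops hung connections; the mounted adapter retries 429/5xx responses
    resp = session.get(url_req, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.encoding = 'utf-8'
    return resp.text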
def find_url(url_resp):  # first pass: extract every post page from a listing page
    obj = re.compile(r' <a class="media-content" target="_blank" href="(?P<url>.*?)" title="(?P<title>.*?)"', re.S)
    return obj.findall(url_resp)

def find_img(url_resp_son):  # second pass: extract every image link from a post page
    obj = re.compile(r'<a rel="nofollow" href="(?P<img>.*?)" alt="(?P<title>.*?)"', re.S)
    return obj.findall(url_resp_son)
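# --- Side note on the duplicates: the same post URL can be matched more than
# once on a listing page (and a post can repeat across pages), so everything
# downstream runs again for it. A cheap guard is to deduplicate the
# (url, title) pairs before visiting them; a sketch (dedupe is an example
# name, not something the original script defines):
seen_urls = set()  # post URLs already handled in this run

def dedupe(pairs):
    # keep only pairs whose URL has not been seen yet
    fresh = [p for p in pairs if p[0] not in seen_urls]
    seen_urls.update(p[0] for p in fresh)
    return fresh
# usage: url_sonss = dedupe(find_url(resp))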
def dr_img(url_img, path, title):  # download one image into path\title
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    pathname = path + "\\" + title
    if not os.path.exists(pathname):
        img_resp = requests.get(url=url_img, headers=headers)
        if img_resp.status_code != 200:  # don't write a broken file on a failed download
            print(f"{title} failed: HTTP {img_resp.status_code}")
            img_resp.close()
            return
        try:
            with open(pathname, mode='wb') as f:
                f.write(img_resp.content)
            print(title + ' over!')
        except TypeError:
            print("TypeError")
        finally:
            img_resp.close()
    else:
        print(f"{title} already exists!")
def create_main_file(filename):  # create the top-level save folder
    path = r"D:\vs\爬虫\美图爬虫\vmgirls"
    try:
        os.makedirs(path + "\\" + filename)
        print("folder created")
    except FileExistsError:
        print("folder already exists")

def create_file(file_name_son):  # create one folder per listing page
    try:
        path = r"D:\vs\爬虫\美图爬虫\vmgirls\pure"
        os.makedirs(path + "\\" + file_name_son)
    except FileExistsError:
        print("folder already exists")

def create_son_file(file_son_son_name, i):  # create one folder per post
    try:
        path = rf"D:\vs\爬虫\美图爬虫\vmgirls\pure\第{i}页"
        os.makedirs(path + "\\" + file_son_son_name)
        print(f"{file_son_son_name} created")
    except FileExistsError:
        print("folder already exists")
create_main_file("pure")
for i in range(1, 98):
    create_file(f"第{i}页")
    url = "https://www.vmgirls.com/pure/page/" + str(i) + "/"
    resp = send_requests(url)  # raw listing-page HTML
    url_sonss = find_url(resp)  # (url, title) pairs pulled out of it
    print(url_sonss)
    for url_sons in url_sonss:
        print(url_sons)  # url_sons is a (page URL, title) pair
        url_son_t = url_sons[0]  # post page URL
        url_title = url_sons[1]  # post title
        print(url_son_t)
        print(url_title)
        resp_son = send_requests(url_son_t)  # post-page HTML
        result_imgs = find_img(resp_son)  # image links on that post
        create_son_file(url_title, i)
        for url_img in result_imgs:  # each image link
            img_url = url_img[0]
            print(img_url)
            last_name = img_url.split("/")[-1]
            dr_img(img_url, rf"D:\vs\爬虫\美图爬虫\vmgirls\pure\第{i}页\{url_title}", last_name)
            time.sleep(2)  # be polite between image downloads
        time.sleep(2)  # ...and between posts
    time.sleep(2)  # ...and between listing pages
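On the speed issue: almost all of the time here is spent waiting on the network, so downloading the images of a single post in parallel should help the most. A rough sketch with the standard-library concurrent.futures (download_post_imgs and the worker count are just example choices, not something the script above defines; keep the sleeps between pages so you don't hammer the site):

from concurrent.futures import ThreadPoolExecutor

def download_post_imgs(result_imgs, folder):
    # run up to 8 dr_img calls at once instead of one by one
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url_img in result_imgs:
            img_url = url_img[0]
            pool.submit(dr_img, img_url, folder, img_url.split("/")[-1])
# usage inside the loop above:
# download_post_imgs(result_imgs, rf"D:\vs\爬虫\美图爬虫\vmgirls\pure\第{i}页\{url_title}")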