百木园-与人分享,
就是让自己快乐。

PYTHON爬取图片

from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
import requests
from lxml import etree
from urllib import parse

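# TODO: exception handling is still minimal and will be improved later.
# Known issue 1: only the images reachable from the listing pages are crawled; content nested further inside each image page is not handled yet.
# Known issue 2: crawling a large number of pages raises an error; the cause has not been identified yet.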

# HTTP headers sent with every request made by this script.
headers = {
    # Browser-like User-Agent string (Chrome on Android).
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/106.0.0.0 Mobile Safari/537.36"
    ),
    # Anti-hotlinking: Referer tells the server which page this request
    # originated from.
    "Referer": "https://xxx",
}


def _xpath_of_page(url, expr):
    """Fetch ``url`` and return the result list of XPath ``expr`` on its HTML.

    A failed request is reported on stdout and yields an empty list so that a
    single bad page does not abort the whole crawl.
    """
    try:
        # A timeout keeps one stalled connection from blocking the crawl forever.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"请求失败,已跳过------------>{url} ({exc})")
        return []
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    # Guard against a missing tree (e.g. an empty body): treat as "no matches".
    return [] if tree is None else tree.xpath(expr)


def get_img_src(q):
    """Collect image URLs from the listing pages and put them on ``q``.

    Listing pages 1-4 are fetched, every detail link found under the
    ``list-box-p`` container is followed, and the first ``img_box`` image URL
    of each detail page is put on the queue.  The sentinel string "完事了" is
    always put last, even if the crawl fails, so the consumer can stop.

    Args:
        q: Queue-like object; only its ``put`` method is used.
    """
    page_urls = [
        "https://xxx/index.html" if page == 1 else f"https://xxx/{page}.html"
        for page in range(1, 5)
    ]
    try:
        for page_url in page_urls:
            hrefs = _xpath_of_page(
                page_url, "//div[@class='list-box-p']/ul/li/a/@href"
            )
            for href in hrefs:
                # urljoin leaves absolute URLs untouched and resolves
                # relative / protocol-relative ones against the current page.
                detail_url = parse.urljoin(page_url, href)
                srcs = _xpath_of_page(
                    detail_url, "//div[@class='img_box']/a/img/@src"
                )
                if not srcs:
                    # No matching image on this page: skip it rather than
                    # fail with IndexError on srcs[0].
                    continue
                src = parse.urljoin(detail_url, srcs[0])
                q.put(src)  # hand the URL over to the downloader
                print(f"---------------------------------------------------被塞进队列--------------------->{src}")
    finally:
        # Always send the sentinel; without it the consumer blocks on
        # q.get() forever when the crawl ends with an exception.
        q.put("完事了")


def download(src):
    """Download the image at ``src`` into the ``./image/`` directory.

    The file name is the last ``/``-separated segment of ``src``.  A failed
    request (connection error, timeout, or HTTP error status) is reported on
    stdout and nothing is written to disk.

    Args:
        src: URL of the image to download.
    """
    import os  # function-scope import keeps this block self-contained

    print('开始下载------------>', src)
    name = src.split('/')[-1]
    try:
        # Fetch before opening the file so a failed request cannot leave an
        # empty file behind; the timeout prevents a worker from hanging.
        resp = requests.get(src, headers=headers, timeout=10)
        # Do not save an HTTP error page as if it were the image.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('下载失败------------>', src, exc)
        return
    # The target directory may not exist on a first run.
    os.makedirs("./image", exist_ok=True)
    with open("./image/" + name, mode='wb') as f:
        f.write(resp.content)
    print('下载完毕------------>', src)


def download_img(q):
    """Consume image URLs from ``q`` and download them on a 5-thread pool.

    URLs are taken from the queue until the sentinel string "完事了" is
    received.  After all submitted downloads have finished, any exception
    raised inside a download task is reported on stdout instead of being
    silently dropped with its future.

    Args:
        q: Queue-like object; only its blocking ``get`` method is used.
    """
    pending = []
    with ThreadPoolExecutor(5) as pool:
        # iter(callable, sentinel) calls q.get() repeatedly (blocking while
        # the queue is empty) and stops once the sentinel is returned.
        for src in iter(q.get, "完事了"):
            pending.append((src, pool.submit(download, src)))
    # Leaving the with-block waits for every submitted task to finish, so
    # future.exception() below returns immediately.
    for src, future in pending:
        exc = future.exception()
        if exc is not None:
            print('下载失败------------>', src, exc)


if __name__ == '__main__':
    # Producer/consumer pair: one process finds image URLs, the other
    # downloads them; the queue carries the URLs between the two.
    q = Queue()
    p1 = Process(target=get_img_src, args=(q,))
    p2 = Process(target=download_img, args=(q,))
    p1.start()
    p2.start()
    # Wait explicitly for both children so the parent only exits once the
    # crawl and all downloads are done.
    p1.join()
    p2.join()

来源:https://www.cnblogs.com/mingdeng3000/p/16972305.html
本站部分图文来源于网络,如有侵权请联系删除。

未经允许不得转载:百木园 » PYTHON爬取图片

相关推荐

  • 暂无文章