废话不多说,直接上代码:
"""Search Douyin for a keyword with Selenium and download the result videos.

The script opens the Douyin video search page in Chrome, waits for the user
to log in by scanning a QR code with the Douyin mobile app, scrolls to load
more results, then opens every result in a new tab, extracts the video
``<source>`` URL from the page and saves the video to disk.
"""
import html
import os
import re
import time
import uuid
from urllib.parse import quote

import requests
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Default download directory (the original hard-coded location).
VIDEO_DIR = r'C:\Users\lvye\Desktop\dou_yin\video'

# Page-structure XPaths; these mirror the site's markup at the time of writing.
LOGIN_BUTTON_XPATH = '//*[@id="qdblhsHs"]/button'
RELOAD_BUTTON_XPATH = '//*[@id="dark"]/div[2]/div/div[3]/div[2]/div/div/span'
DETAIL_LINK_XPATH = '//*[@id="dark"]/div[2]/div/div[3]/div[2]/ul/li/div/div/a[1]'
VIDEO_TAG_XPATH = (
    '//*[@id="root"]/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div[1]'
    '/div/div[2]/div[2]/xg-video-container/video'
)

# Compiled once: captures the src attribute of the first <source class=""> tag.
_VIDEO_SRC_RE = re.compile(r'<source class="" src="(.*?)"')

option = ChromeOptions()
option.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
)
# Keep the site from detecting that an automation tool is driving the browser.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)
browser = webdriver.Chrome(options=option)
browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
})
browser.maximize_window()  # maximize the browser window


def _extract_video_url(page_source):
    """Return the absolute video URL found in *page_source*.

    Args:
        page_source: HTML text of a video detail page.

    Returns:
        The URL from the first ``<source class="" src="...">`` tag, with HTML
        entities decoded and ``https:`` prepended unless the value already
        carries an ``http://`` or ``https://`` scheme.

    Raises:
        ValueError: if no matching ``<source>`` tag is present.
    """
    match = _VIDEO_SRC_RE.search(page_source)
    if match is None:
        raise ValueError('no <source> video URL found in page source')
    # Serialized HTML escapes "&" in attribute values as "&amp;"; undo that so
    # the query string of the URL is sent as intended.
    src = html.unescape(match.group(1))
    if src.startswith(('http://', 'https://')):
        return src
    return 'https:' + src


def _close_extra_windows(main_handle):
    """Close every window except *main_handle* and switch back to it."""
    for handle in browser.window_handles:
        if handle != main_handle:
            browser.switch_to.window(handle)
            browser.close()
    browser.switch_to.window(main_handle)


def douyincrawler(keyword):
    """Search Douyin for *keyword* and download every video that is found.

    Args:
        keyword: search term; it is percent-encoded before being placed in
            the URL path.

    Failures on an individual video are printed and skipped. A failure to
    find the login button, or to return to the search window after a video,
    propagates to the caller.
    """
    url = (
        'https://www.douyin.com/search/' + quote(keyword)
        + '?publish_time=0&sort_type=0&source=switch_tab&type=video'
    )
    browser.get(url)
    # Click "log in"; the user then scans the QR code with the Douyin app.
    browser.find_element(By.XPATH, LOGIN_BUTTON_XPATH).click()
    time.sleep(15)  # give the user time to scan the QR code

    # Scroll down repeatedly so that more results are loaded.
    # NOTE(review): the original indentation was lost in the paste; both error
    # checks are assumed to run after every scroll - confirm against upstream.
    for _ in range(5):
        time.sleep(5)
        browser.execute_script("var q=document.documentElement.scrollTop=10000")
        if '服务出现异常' in browser.page_source:
            browser.refresh()  # reload the page when the service error shows
        if '服务异常,重新' in browser.page_source:
            # Click the "load again" element.
            browser.find_element(By.XPATH, RELOAD_BUTTON_XPATH).click()

    # Collect the link element of every result on the page.
    detail_links = browser.find_elements(By.XPATH, DETAIL_LINK_XPATH)
    print('共计侦查到{}个视频数据'.format(len(detail_links)))

    main_handle = browser.current_window_handle
    for link in detail_links:
        try:
            # Click through JavaScript: a native click can fail on an element
            # that is present but not currently clickable.
            browser.execute_script("arguments[0].click();", link)
            # Wait until the detail tab exists instead of assuming index 1.
            WebDriverWait(browser, 10).until(
                lambda driver: len(driver.window_handles) > 1
            )
            new_handle = [
                handle for handle in browser.window_handles
                if handle != main_handle
            ][-1]
            browser.switch_to.window(new_handle)
            # Explicitly wait for the <video> tag to appear.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, VIDEO_TAG_XPATH))
            )
            savevideo(_extract_video_url(browser.page_source))
        except Exception as e:  # best effort: report and go on to the next video
            print(e)
        finally:
            # Always return to the search window, even after a failure;
            # otherwise every later iteration would act on the wrong tab.
            _close_extra_windows(main_handle)


def savevideo(video_url, video_dir=VIDEO_DIR):
    """Download *video_url* into *video_dir* under a random ``.mp4`` name.

    Args:
        video_url: absolute URL of the video file.
        video_dir: target directory; created when it does not exist.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-success HTTP status (no file is created in that case).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
    }
    os.makedirs(video_dir, exist_ok=True)
    video_full_path = os.path.join(video_dir, str(uuid.uuid4()) + '.mp4')
    # Stream the body so that a large video is never held in memory at once.
    with requests.get(url=video_url, headers=headers, stream=True,
                      timeout=30) as response:
        # Do not save an HTTP error page as if it were a video.
        response.raise_for_status()
        with open(video_full_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    print('已下载:{}'.format(video_url))


if __name__ == '__main__':
    try:
        douyincrawler('街拍美女')
    finally:
        browser.quit()  # shut down Chrome and the driver process
成果展示:
注:该代码仅供技术分享,不得用于违法犯罪。
来源:https://www.cnblogs.com/lvye001/p/16054931.html
本站部分图文来源于网络,如有侵权请联系删除。