Files
toutiao-pachong/douyin_video_crawler.py
2026-02-07 22:23:36 +08:00

304 lines
9.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
import os
import requests
import sys
from DrissionPage import ChromiumOptions, ChromiumPage
class VideoDownloader:
    """Downloads Douyin videos into a local directory."""

    def __init__(self, video_dir):
        """
        Args:
            video_dir: directory where downloaded videos are saved
        """
        self.video_dir = video_dir

    def save_video(self, video_url, aweme_id, video_title):
        """
        Download a single video to ``self.video_dir``.

        Args:
            video_url: direct download URL of the video
            aweme_id: unique video identifier (used in the filename)
            video_title: raw video title (sanitized before use as a filename)

        Returns:
            bool: True when the download completed, False otherwise.
        """
        try:
            # Replace filesystem-illegal characters in one C-level pass and
            # cap the filename length at 50 characters.
            _table = str.maketrans({c: '_' for c in '<>:"/\\|?*'})
            cleaned_title = video_title.translate(_table)[:50]
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...',
                'Referer': 'https://www.douyin.com/'
            }
            print(f"开始下载视频: {cleaned_title}")
            print(f"视频ID: {aweme_id}")
            print(f"下载地址: {video_url}")
            # Context manager guarantees the streamed connection is released
            # even on error/non-200 (the original leaked the response object).
            with requests.get(video_url, headers=headers, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    print(f"下载失败,状态码: {response.status_code}")
                    return False
                video_path = os.path.join(self.video_dir, f'{cleaned_title}_{aweme_id}.mp4')
                print(f"保存路径: {video_path}")
                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0
                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(f"下载进度: {progress:.1f}%", end='\r')
            print(f"\n视频下载完成: {cleaned_title}")
            # Pseudo-random 1-3 s pause (derived from the clock) to avoid
            # hammering the server with back-to-back downloads.
            wait_time = time.time() % 2 + 1
            print(f"等待 {wait_time:.1f} 秒后继续...")
            time.sleep(wait_time)
            return True
        except Exception as e:
            # Best-effort downloader: report and signal failure to the caller.
            print(f"下载视频时出错: {str(e)}")
            return False
class VideoInfoExtractor:
"""视频信息提取器类"""
def __init__(self, video_dir):
"""
初始化视频信息提取器
Args:
video_dir: 视频保存目录
"""
self.downloader = VideoDownloader(video_dir)
def save_video_info(self, video_data):
"""
提取视频信息并下载视频
Args:
video_data: 包含视频信息的字典
Returns:
dict: 提取的视频信息字典
"""
print("\n开始提取视频信息...")
minutes = video_data['video']['duration'] // 1000 // 60
seconds = video_data['video']['duration'] // 1000 % 60
video_url = video_data['video']['play_addr']['url_list'][0].replace('playwm', 'play')
video_title = video_data['desc'].strip().replace('\n', '')
aweme_id = video_data['aweme_id']
print(f"视频标题: {video_title}")
print(f"视频时长: {minutes}:{seconds:02d}")
print(f"作者: {video_data['author']['nickname'].strip()}")
print(f"粉丝数: {video_data['author']['follower_count']}")
print(f"点赞数: {video_data['statistics']['digg_count']}")
video_dict = {
'用户名': video_data['author']['nickname'].strip(),
'用户uid': 'a' + str(video_data['author']['uid']),
'粉丝数量': video_data['author']['follower_count'],
'视频描述': video_title,
'视频标题': video_title,
'点赞数量': video_data['statistics']['digg_count'],
'视频awemeid': aweme_id,
'视频时长': f"{minutes}:{seconds:02d}",
'视频链接': video_url,
}
print("开始下载视频...")
download_success = self.downloader.save_video(video_url, aweme_id, video_title)
if download_success:
print("视频信息提取和下载完成!")
else:
print("视频下载失败,但信息已提取")
return video_dict
class DouyinCrawler:
    """Drives a Chromium-based browser (via DrissionPage) to scrape Douyin search results."""

    def __init__(self, browser_path=None):
        """
        Args:
            browser_path: path to the browser executable; defaults to the
                macOS Microsoft Edge location.
        """
        if browser_path is None:
            browser_path = '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
        self.browser_path = browser_path
        self.driver = None

    def setup_browser(self):
        """Create the ChromiumPage driver; must be called before crawl_videos()."""
        co = ChromiumOptions().set_browser_path(self.browser_path)
        self.driver = ChromiumPage(co)

    def crawl_videos(self, keyword, video_dir):
        """
        Scrape up to 10 result pages for *keyword*, downloading each video
        and finally persisting metadata into *video_dir*.

        Args:
            keyword: search term for Douyin's video search
            video_dir: directory for downloaded videos and metaData.json

        Raises:
            RuntimeError: if setup_browser() was not called first.
        """
        if self.driver is None:
            # Fail fast with a clear message instead of an opaque
            # AttributeError on None further down.
            raise RuntimeError("浏览器未初始化,请先调用 setup_browser()")
        print(f"\n=== 开始爬取关键词: {keyword} ===")
        print(f"目标保存目录: {video_dir}")
        extractor = VideoInfoExtractor(video_dir)
        print("设置网络请求监听器...")
        # Intercept the search API's XHR responses instead of parsing the DOM.
        self.driver.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')
        url = f'https://www.douyin.com/search/{keyword}?type=video'
        print(f"打开搜索页面: {url}")
        self.driver.get(url)
        data_list = []
        total_videos = 0
        print("\n开始爬取视频数据...")
        for page in range(10):
            print(f"\n=== 第 {page + 1} 页 ===")
            print("滚动到页面底部...")
            # Scrolling triggers the next page's lazy-loaded API request.
            self.driver.scroll.to_bottom()
            print("等待网络响应...")
            resp = self.driver.listen.wait()
            if not resp:
                # listen.wait() can return a falsy value (e.g. on timeout);
                # stop cleanly instead of crashing on resp.response.
                print("未捕获到网络响应,停止爬取")
                break
            json_data = resp.response.body
            print(f"获取到 {len(json_data['data'])} 个视频数据")
            for json_aweme_info in json_data['data']:
                data = extractor.save_video_info(json_aweme_info['aweme_info'])
                data_list.append(data)
                total_videos += 1
                print(f"当前累计爬取: {total_videos} 个视频")
            if not json_data['has_more']:
                print("已到达最后一页,停止爬取")
                break
            # Pseudo-random 2-5 s pause between pages to look less bot-like.
            wait_time = time.time() % 3 + 2
            print(f"等待 {wait_time:.1f} 秒后继续下一页爬取...")
            time.sleep(wait_time)
        print(f"\n=== 爬取完成 ===")
        print(f"总计爬取: {total_videos} 个视频")
        # Persist metadata only when at least one video was collected.
        if data_list:
            VideoManager.save_metadata(data_list, video_dir)
        else:
            print("没有爬取到视频数据,跳过元数据保存")

    def close(self):
        """Shut the browser down if it was started."""
        if self.driver:
            self.driver.quit()
class VideoManager:
    """Filesystem helpers: output-directory creation and metadata persistence."""

    @staticmethod
    def create_video_directory(keyword):
        """
        Ensure the per-keyword output directory exists.

        Args:
            keyword: search keyword, used as the directory name

        Returns:
            str: directory path, ``./douyin_videos/<keyword>``
        """
        video_dir = f"./douyin_videos/{keyword}"
        if os.path.exists(video_dir):
            print(f"使用现有目录: {video_dir}")
        else:
            print(f"创建视频保存目录: {video_dir}")
        # exist_ok=True removes the check-then-create race: if another process
        # creates the directory between exists() and makedirs(), we no longer crash.
        os.makedirs(video_dir, exist_ok=True)
        return video_dir

    @staticmethod
    def save_metadata(metadata_list, video_dir):
        """
        Write the collected video metadata to ``<video_dir>/metaData.json``.

        Args:
            metadata_list: list of per-video metadata dicts
            video_dir: directory the JSON file is written into
        """
        import json
        metadata_file = os.path.join(video_dir, 'metaData.json')
        print(f"\n开始保存元数据到: {metadata_file}")
        print(f"{len(metadata_list)} 个视频的元数据")
        try:
            # Wrap the list with a save timestamp and count for convenience.
            metadata = {
                'save_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                'video_count': len(metadata_list),
                'videos': metadata_list,
            }
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print(f"元数据保存成功!")
            print(f"文件大小: {os.path.getsize(metadata_file) / 1024:.2f} KB")
        except Exception as e:
            # Best-effort persistence: report the failure, don't abort the crawl.
            print(f"保存元数据时出错: {str(e)}")
def main():
    """CLI entry point: parse the keyword, set up the output dir, run the crawler."""
    if len(sys.argv) < 2:
        # The original message hard-coded a wrong script name ("pyauto.py");
        # derive it from argv so the usage hint always matches this file.
        print(f"请提供关键词例如python {sys.argv[0]} 猫咪")
        sys.exit(1)
    keyword = sys.argv[1]
    # Create the per-keyword video directory.
    video_dir = VideoManager.create_video_directory(keyword)
    # Spin up the crawler and start scraping.
    crawler = DouyinCrawler()
    try:
        crawler.setup_browser()
        crawler.crawl_videos(keyword, video_dir)
    finally:
        # Always close the browser, even when crawling raised.
        crawler.close()


if __name__ == "__main__":
    main()