toutiao-pachong/douyin_video_crawler.py

import json
import os
import sys
import time

import requests
from DrissionPage import ChromiumOptions, ChromiumPage
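
# Third-party dependencies (assumed to be installed): DrissionPage drives a
# Chromium-based browser and intercepts its network traffic; requests streams
# the video files to disk.
#   pip install DrissionPage requests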


class VideoDownloader:
    """Video downloader."""

    def __init__(self, video_dir):
        """
        Initialize the video downloader.

        Args:
            video_dir: Directory where videos are saved.
        """
        self.video_dir = video_dir

    def save_video(self, video_url, aweme_id, video_title):
        """
        Download a video to local disk.

        Args:
            video_url: Video download URL.
            aweme_id: Unique identifier of the video.
            video_title: Video title.

        Returns:
            bool: Whether the download succeeded.
        """
        try:
            # Clean the video title: replace characters that are illegal in file names.
            def clean_filename(filename):
                invalid_chars = '<>:"/\\|?*'
                for char in invalid_chars:
                    filename = filename.replace(char, '_')
                return filename[:50]  # Cap the file name length.

            cleaned_title = clean_filename(video_title)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...',
                'Referer': 'https://www.douyin.com/'
            }
            print(f"Starting video download: {cleaned_title}")
            print(f"Video ID: {aweme_id}")
            print(f"Download URL: {video_url}")
            response = requests.get(video_url, headers=headers, stream=True, timeout=30)
            if response.status_code == 200:
                video_path = os.path.join(self.video_dir, f'{cleaned_title}_{aweme_id}.mp4')
                print(f"Save path: {video_path}")
                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0
                # Stream the body in 1 MiB chunks so large videos are never held in memory.
                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(f"Download progress: {progress:.1f}%", end='\r')
                print(f"\nVideo download finished: {cleaned_title}")
                # Quasi-random 1-3 second pause derived from the clock, to avoid
                # hammering the server with back-to-back requests.
                wait_time = time.time() % 2 + 1
                print(f"Waiting {wait_time:.1f} seconds before continuing...")
                time.sleep(wait_time)
                return True
            else:
                print(f"Download failed, status code: {response.status_code}")
                return False
        except Exception as e:
            print(f"Error while downloading video: {e}")
            return False
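

# Minimal usage sketch for VideoDownloader on its own (the URL, id, and title
# below are placeholders, not real Douyin values):
#
#   downloader = VideoDownloader('./douyin_videos/test')
#   downloader.save_video('https://example.com/video.mp4', '1234567890', 'demo title')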


class VideoInfoExtractor:
    """Video info extractor."""

    def __init__(self, video_dir):
        """
        Initialize the video info extractor.

        Args:
            video_dir: Directory where videos are saved.
        """
        self.downloader = VideoDownloader(video_dir)

    def save_video_info(self, video_data):
        """
        Extract the video info and download the video.

        Args:
            video_data: Dict holding the raw video info.

        Returns:
            dict: The extracted video info.
        """
        print("\nExtracting video info...")
        minutes = video_data['video']['duration'] // 1000 // 60
        seconds = video_data['video']['duration'] // 1000 % 60
        # Swapping 'playwm' for 'play' in the URL selects the watermark-free stream.
        video_url = video_data['video']['play_addr']['url_list'][0].replace('playwm', 'play')
        video_title = video_data['desc'].strip().replace('\n', '')
        aweme_id = video_data['aweme_id']
        print(f"Title: {video_title}")
        print(f"Duration: {minutes}:{seconds:02d}")
        print(f"Author: {video_data['author']['nickname'].strip()}")
        print(f"Followers: {video_data['author']['follower_count']}")
        print(f"Likes: {video_data['statistics']['digg_count']}")
        video_dict = {
            'username': video_data['author']['nickname'].strip(),
            # Prefix the uid with 'a' so downstream tools keep it as a string.
            'user_uid': 'a' + str(video_data['author']['uid']),
            'follower_count': video_data['author']['follower_count'],
            'description': video_title,
            'title': video_title,
            'digg_count': video_data['statistics']['digg_count'],
            'aweme_id': aweme_id,
            'duration': f"{minutes}:{seconds:02d}",
            'video_url': video_url,
        }
        print("Starting video download...")
        download_success = self.downloader.save_video(video_url, aweme_id, video_title)
        if download_success:
            print("Video info extracted and video downloaded!")
        else:
            print("Video download failed, but its info was extracted")
        return video_dict
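

# Shape of the minimal `video_data` dict that save_video_info reads (fields
# inferred from the accesses above; real search responses carry many more
# keys, and the values here are placeholders):
#
#   {
#       'aweme_id': '7...',
#       'desc': 'video description',
#       'video': {
#           'duration': 15000,  # milliseconds
#           'play_addr': {'url_list': ['https://.../playwm/...']},
#       },
#       'author': {'nickname': 'name', 'uid': 123, 'follower_count': 0},
#       'statistics': {'digg_count': 0},
#   }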


class DouyinCrawler:
    """Douyin crawler."""

    def __init__(self, browser_path=None):
        """
        Initialize the Douyin crawler.

        Args:
            browser_path: Browser executable path; defaults to the standard
                Microsoft Edge location on macOS.
        """
        if browser_path is None:
            browser_path = '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
        self.browser_path = browser_path
        self.driver = None

    def setup_browser(self):
        """Configure and launch the browser."""
        co = ChromiumOptions().set_browser_path(self.browser_path)
        self.driver = ChromiumPage(co)

    def crawl_videos(self, keyword, video_dir):
        """
        Crawl video data for a search keyword.

        Args:
            keyword: Search keyword.
            video_dir: Directory where videos are saved.
        """
        print(f"\n=== Crawling keyword: {keyword} ===")
        print(f"Target save directory: {video_dir}")
        extractor = VideoInfoExtractor(video_dir)
        # Intercept the search API responses instead of parsing the page DOM.
        print("Setting up network request listener...")
        self.driver.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')
        url = f'https://www.douyin.com/search/{keyword}?type=video'
        print(f"Opening search page: {url}")
        self.driver.get(url)
        data_list = []
        total_videos = 0
        print("\nCrawling video data...")
        for page in range(10):
            print(f"\n=== Page {page + 1} ===")
            print("Scrolling to the bottom of the page...")
            self.driver.scroll.to_bottom()
            print("Waiting for network response...")
            resp = self.driver.listen.wait()
            json_data = resp.response.body
            print(f"Received {len(json_data['data'])} video entries")
            for json_aweme_info in json_data['data']:
                data = extractor.save_video_info(json_aweme_info['aweme_info'])
                data_list.append(data)
                total_videos += 1
                print(f"Crawled so far: {total_videos} videos")
            if not json_data['has_more']:
                print("Reached the last page, stopping")
                break
            # Quasi-random 2-5 second pause between pages.
            wait_time = time.time() % 3 + 2
            print(f"Waiting {wait_time:.1f} seconds before crawling the next page...")
            time.sleep(wait_time)
        print("\n=== Crawl finished ===")
        print(f"Total videos crawled: {total_videos}")
        # Persist the metadata.
        if data_list:
            VideoManager.save_metadata(data_list, video_dir)
        else:
            print("No video data crawled, skipping metadata save")

    def close(self):
        """Close the browser."""
        if self.driver:
            self.driver.quit()
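

# Note on the listener: as I understand DrissionPage's API, listen.start(target)
# captures requests whose URL contains the target substring, and listen.wait()
# blocks until the next matching response arrives, returning a packet whose
# response.body holds the parsed JSON. Scrolling to the bottom of the results
# page is what makes Douyin fire the next search/item request.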


class VideoManager:
    """Video manager."""

    @staticmethod
    def create_video_directory(keyword):
        """
        Create the directory videos are saved into.

        Args:
            keyword: Search keyword.

        Returns:
            str: Path of the video directory.
        """
        video_dir = f"./douyin_videos/{keyword}"
        if not os.path.exists(video_dir):
            print(f"Creating video directory: {video_dir}")
            os.makedirs(video_dir)
        else:
            print(f"Using existing directory: {video_dir}")
        return video_dir

    @staticmethod
    def save_metadata(metadata_list, video_dir):
        """
        Save the video metadata to a JSON file.

        Args:
            metadata_list: List of video metadata dicts.
            video_dir: Directory to save into.
        """
        metadata_file = os.path.join(video_dir, 'metaData.json')
        print(f"\nSaving metadata to: {metadata_file}")
        print(f"Metadata for {len(metadata_list)} videos")
        try:
            # Record the save time alongside the data.
            metadata = {
                'save_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                'video_count': len(metadata_list),
                'videos': metadata_list
            }
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print("Metadata saved successfully!")
            print(f"File size: {os.path.getsize(metadata_file) / 1024:.2f} KB")
        except Exception as e:
            print(f"Error while saving metadata: {e}")


def main():
    """Entry point."""
    if len(sys.argv) < 2:
        print("Please provide a keyword, e.g.: python douyin_video_crawler.py 猫咪")
        sys.exit(1)
    keyword = sys.argv[1]
    # Create the directory the videos will be saved into.
    video_dir = VideoManager.create_video_directory(keyword)
    # Create the crawler instance and start crawling.
    crawler = DouyinCrawler()
    try:
        crawler.setup_browser()
        crawler.crawl_videos(keyword, video_dir)
    finally:
        crawler.close()


if __name__ == "__main__":
    main()
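

# Usage sketch, run from a shell (assumes Microsoft Edge at the default macOS
# path hard-coded in DouyinCrawler; on other platforms construct the crawler
# with an explicit browser_path, e.g. a hypothetical Windows install):
#
#   python douyin_video_crawler.py 猫咪
#
#   crawler = DouyinCrawler(
#       browser_path=r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe')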