# Douyin video search crawler and downloader.
import time
|
||
import os
|
||
import requests
|
||
import sys
|
||
from DrissionPage import ChromiumOptions, ChromiumPage
|
||
|
||
|
||
class VideoDownloader:
    """Downloads videos over HTTP into a local directory."""

    # Characters that are invalid in filenames on common filesystems.
    _INVALID_CHARS = '<>:"/\\|?*'

    def __init__(self, video_dir):
        """
        Initialize the video downloader.

        Args:
            video_dir: Directory where downloaded videos are saved.
        """
        self.video_dir = video_dir

    @staticmethod
    def _clean_filename(filename):
        """Replace filesystem-invalid characters with '_' and cap length at 50."""
        for char in VideoDownloader._INVALID_CHARS:
            filename = filename.replace(char, '_')
        return filename[:50]  # keep filenames short enough for any filesystem

    def save_video(self, video_url, aweme_id, video_title):
        """
        Download a single video to self.video_dir.

        Args:
            video_url: Direct download URL of the video.
            aweme_id: Unique identifier of the video (used in the filename).
            video_title: Video title (sanitized for use in the filename).

        Returns:
            bool: True on successful download, False otherwise.
        """
        try:
            cleaned_title = self._clean_filename(video_title)

            # Referer is required or the CDN rejects the request.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...',
                'Referer': 'https://www.douyin.com/'
            }

            print(f"开始下载视频: {cleaned_title}")
            print(f"视频ID: {aweme_id}")
            print(f"下载地址: {video_url}")

            # Stream the body so large files are never fully buffered in memory.
            # The context manager releases the connection in all cases — the
            # original version leaked the response on every call.
            with requests.get(video_url, headers=headers, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    print(f"下载失败,状态码: {response.status_code}")
                    return False

                video_path = os.path.join(self.video_dir, f'{cleaned_title}_{aweme_id}.mp4')
                print(f"保存路径: {video_path}")

                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0

                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(f"下载进度: {progress:.1f}%", end='\r')

            print(f"\n视频下载完成: {cleaned_title}")

            # Clock-derived 1-3s pause used as naive rate limiting between
            # downloads (pseudo-random, not cryptographic).
            wait_time = time.time() % 2 + 1
            print(f"等待 {wait_time:.1f} 秒后继续...")
            time.sleep(wait_time)
            return True
        except Exception as e:
            # Best-effort: report and signal failure rather than aborting the crawl.
            print(f"下载视频时出错: {str(e)}")
            return False
||
class VideoInfoExtractor:
    """Extracts metadata from a raw video record and triggers its download."""

    def __init__(self, video_dir):
        """
        Initialize the extractor.

        Args:
            video_dir: Directory handed to the underlying VideoDownloader.
        """
        self.downloader = VideoDownloader(video_dir)

    def save_video_info(self, video_data):
        """
        Pull the relevant fields out of *video_data* and download the video.

        Args:
            video_data: Raw per-video dict from the search API response.

        Returns:
            dict: Extracted metadata (author, stats, duration, URL, ...).
        """
        print("\n开始提取视频信息...")

        author = video_data['author']
        stats = video_data['statistics']

        # Duration arrives in milliseconds; render as M:SS.
        minutes, seconds = divmod(video_data['video']['duration'] // 1000, 60)
        duration_str = f"{minutes}:{seconds:02d}"

        # 'playwm' is the watermarked endpoint; 'play' serves the clean file.
        raw_url = video_data['video']['play_addr']['url_list'][0]
        video_url = raw_url.replace('playwm', 'play')
        video_title = video_data['desc'].strip().replace('\n', '')
        aweme_id = video_data['aweme_id']

        print(f"视频标题: {video_title}")
        print(f"视频时长: {duration_str}")
        print(f"作者: {author['nickname'].strip()}")
        print(f"粉丝数: {author['follower_count']}")
        print(f"点赞数: {stats['digg_count']}")

        video_dict = {
            '用户名': author['nickname'].strip(),
            '用户uid': 'a' + str(author['uid']),
            '粉丝数量': author['follower_count'],
            '视频描述': video_title,
            '视频标题': video_title,
            '点赞数量': stats['digg_count'],
            '视频awemeid': aweme_id,
            '视频时长': duration_str,
            '视频链接': video_url,
        }

        print("开始下载视频...")
        if self.downloader.save_video(video_url, aweme_id, video_title):
            print("视频信息提取和下载完成!")
        else:
            print("视频下载失败,但信息已提取")

        return video_dict
||
class DouyinCrawler:
    """Drives a Chromium-based browser to crawl Douyin video search results."""

    def __init__(self, browser_path=None):
        """
        Initialize the crawler.

        Args:
            browser_path: Path to the browser executable; defaults to the
                standard macOS Microsoft Edge location.
        """
        if browser_path is None:
            # NOTE(review): macOS-only default path; other platforms must
            # pass an explicit browser_path.
            browser_path = '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
        self.browser_path = browser_path
        self.driver = None  # ChromiumPage instance; created in setup_browser()

    def setup_browser(self):
        """Configure and launch the browser."""
        co = ChromiumOptions().set_browser_path(self.browser_path)
        self.driver = ChromiumPage(co)

    def crawl_videos(self, keyword, video_dir):
        """
        Crawl search-result videos for *keyword* into *video_dir*.

        Intercepts the search API's responses while scrolling the results page
        (at most 10 pages), extracts and downloads each video, then persists
        the collected metadata via VideoManager.save_metadata.

        Args:
            keyword: Search keyword.
            video_dir: Directory where videos and metadata are saved.
        """
        print(f"\n=== 开始爬取关键词: {keyword} ===")
        print(f"目标保存目录: {video_dir}")

        extractor = VideoInfoExtractor(video_dir)

        # Intercept the search API endpoint so each scroll yields a JSON payload.
        print("设置网络请求监听器...")
        self.driver.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')

        url = f'https://www.douyin.com/search/{keyword}?type=video'
        print(f"打开搜索页面: {url}")
        self.driver.get(url)

        data_list = []
        total_videos = 0

        print("\n开始爬取视频数据...")
        for page in range(10):  # hard cap of 10 result pages
            print(f"\n=== 第 {page + 1} 页 ===")
            print("滚动到页面底部...")
            # Scrolling to the bottom triggers the next page's API request.
            self.driver.scroll.to_bottom()

            print("等待网络响应...")
            # NOTE(review): listen.wait() is called with no timeout and
            # resp.response.body is assumed to be present and parsed — confirm
            # against DrissionPage's listen API before relying on this.
            resp = self.driver.listen.wait()
            json_data = resp.response.body

            print(f"获取到 {len(json_data['data'])} 个视频数据")

            for json_aweme_info in json_data['data']:
                data = extractor.save_video_info(json_aweme_info['aweme_info'])
                data_list.append(data)
                total_videos += 1

            print(f"当前累计爬取: {total_videos} 个视频")

            if not json_data['has_more']:
                print("已到达最后一页,停止爬取")
                break

            # Clock-derived 2-5s pause between pages as naive rate limiting.
            wait_time = time.time() % 3 + 2
            print(f"等待 {wait_time:.1f} 秒后继续下一页爬取...")
            time.sleep(wait_time)

        print(f"\n=== 爬取完成 ===")
        print(f"总计爬取: {total_videos} 个视频")

        # Persist the collected metadata (skipped when nothing was crawled).
        if data_list:
            VideoManager.save_metadata(data_list, video_dir)
        else:
            print("没有爬取到视频数据,跳过元数据保存")

    def close(self):
        """Close the browser if it was started."""
        if self.driver:
            self.driver.quit()
||
class VideoManager:
    """Manages the on-disk layout: save directories and metadata files."""

    @staticmethod
    def create_video_directory(keyword):
        """
        Create (if needed) the save directory for *keyword* and return it.

        Args:
            keyword: Search keyword; used as the directory name.

        Returns:
            str: Path of the video save directory.
        """
        video_dir = f"./douyin_videos/{keyword}"
        if not os.path.exists(video_dir):
            print(f"创建视频保存目录: {video_dir}")
            # exist_ok=True closes the TOCTOU race between the exists() check
            # above and the actual directory creation.
            os.makedirs(video_dir, exist_ok=True)
        else:
            print(f"使用现有目录: {video_dir}")
        return video_dir

    @staticmethod
    def save_metadata(metadata_list, video_dir):
        """
        Save video metadata to a JSON file in *video_dir*.

        Args:
            metadata_list: List of per-video metadata dicts.
            video_dir: Directory where metaData.json is written.
        """
        import json  # local import: only needed here

        metadata_file = os.path.join(video_dir, 'metaData.json')
        print(f"\n开始保存元数据到: {metadata_file}")
        print(f"共 {len(metadata_list)} 个视频的元数据")

        try:
            # Wrap the list with a save timestamp and count for traceability.
            metadata = {
                'save_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                'video_count': len(metadata_list),
                'videos': metadata_list
            }

            # ensure_ascii=False keeps the Chinese keys/values human-readable.
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            print(f"元数据保存成功!")
            print(f"文件大小: {os.path.getsize(metadata_file) / 1024:.2f} KB")
        except Exception as e:
            # Best-effort: a metadata failure should not abort the crawl.
            print(f"保存元数据时出错: {str(e)}")
||
def main():
    """Command-line entry point: crawl Douyin videos for the given keyword."""
    if len(sys.argv) < 2:
        print("请提供关键词,例如:python pyauto.py 猫咪")
        sys.exit(1)

    keyword = sys.argv[1]

    # Prepare the per-keyword output directory.
    target_dir = VideoManager.create_video_directory(keyword)

    # Run the crawl, ensuring the browser is closed even on failure.
    crawler = DouyinCrawler()
    try:
        crawler.setup_browser()
        crawler.crawl_videos(keyword, target_dir)
    finally:
        crawler.close()
||
# Script entry point: `python pyauto.py <keyword>`.
if __name__ == "__main__":
    main()