"""Douyin (抖音) video search crawler.

Searches Douyin for a keyword with a DrissionPage-driven browser, intercepts
the search API responses via the network listener, downloads each result
video with ``requests``, and stores per-keyword metadata as a JSON file.
"""

import json
import os
import random
import sys
import time

import requests
from DrissionPage import ChromiumOptions, ChromiumPage


def _clean_filename(filename):
    """Replace characters illegal in file names and cap the length.

    Args:
        filename: Raw title text.

    Returns:
        str: Sanitized name, truncated to 50 characters to keep paths
        well under OS file-name limits.
    """
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename[:50]


class VideoDownloader:
    """Downloads individual video files into a local directory."""

    def __init__(self, video_dir):
        """
        Args:
            video_dir: Directory where downloaded videos are saved.
        """
        self.video_dir = video_dir

    def save_video(self, video_url, aweme_id, video_title):
        """Stream one video to disk, printing progress while downloading.

        Args:
            video_url: Direct download URL of the video.
            aweme_id: Unique Douyin id of the video (part of the file name).
            video_title: Title used (sanitized) as the file-name prefix.

        Returns:
            bool: True when the file was fully written, False on any error.
        """
        try:
            cleaned_title = _clean_filename(video_title)
            # Douyin's CDN rejects requests without a browser UA + referer.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...',
                'Referer': 'https://www.douyin.com/'
            }
            print(f"开始下载视频: {cleaned_title}")
            print(f"视频ID: {aweme_id}")
            print(f"下载地址: {video_url}")

            # `with` releases the streamed connection even if writing to
            # disk fails part-way (the original leaked the response).
            with requests.get(video_url, headers=headers, stream=True,
                              timeout=30) as response:
                if response.status_code != 200:
                    print(f"下载失败,状态码: {response.status_code}")
                    return False

                video_path = os.path.join(
                    self.video_dir, f'{cleaned_title}_{aweme_id}.mp4')
                print(f"保存路径: {video_path}")

                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0
                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(f"下载进度: {progress:.1f}%", end='\r')

            print(f"\n视频下载完成: {cleaned_title}")
            # True random jitter between downloads (same 1-3 s range as the
            # original clock-derived `time.time() % 2 + 1`, but unpredictable).
            wait_time = random.uniform(1, 3)
            print(f"等待 {wait_time:.1f} 秒后继续...")
            time.sleep(wait_time)
            return True
        except Exception as e:
            # Best-effort: one failed download must not abort the whole crawl.
            print(f"下载视频时出错: {str(e)}")
            return False


class VideoInfoExtractor:
    """Extracts metadata from one search result and triggers its download."""

    def __init__(self, video_dir):
        """
        Args:
            video_dir: Directory passed through to the downloader.
        """
        self.downloader = VideoDownloader(video_dir)

    def save_video_info(self, video_data):
        """Build a metadata dict for one video and download its file.

        Args:
            video_data: One ``aweme_info`` payload from Douyin's search API
                (assumed to contain video/author/statistics sub-dicts —
                shape inferred from usage, not validated here).

        Returns:
            dict: Extracted metadata (Chinese keys, as stored in metaData.json).
        """
        print("\n开始提取视频信息...")
        duration_seconds = video_data['video']['duration'] // 1000  # API gives ms
        minutes = duration_seconds // 60
        seconds = duration_seconds % 60
        # 'playwm' URLs serve the watermarked stream; 'play' is the clean one.
        video_url = video_data['video']['play_addr']['url_list'][0].replace('playwm', 'play')
        video_title = video_data['desc'].strip().replace('\n', '')
        aweme_id = video_data['aweme_id']

        print(f"视频标题: {video_title}")
        print(f"视频时长: {minutes}:{seconds:02d}")
        print(f"作者: {video_data['author']['nickname'].strip()}")
        print(f"粉丝数: {video_data['author']['follower_count']}")
        print(f"点赞数: {video_data['statistics']['digg_count']}")

        video_dict = {
            '用户名': video_data['author']['nickname'].strip(),
            # 'a' prefix presumably keeps long numeric ids from being mangled
            # by spreadsheet tools — preserved as-is; confirm with consumers.
            '用户uid': 'a' + str(video_data['author']['uid']),
            '粉丝数量': video_data['author']['follower_count'],
            '视频描述': video_title,
            '视频标题': video_title,
            '点赞数量': video_data['statistics']['digg_count'],
            '视频awemeid': aweme_id,
            '视频时长': f"{minutes}:{seconds:02d}",
            '视频链接': video_url,
        }

        print("开始下载视频...")
        download_success = self.downloader.save_video(video_url, aweme_id, video_title)
        if download_success:
            print("视频信息提取和下载完成!")
        else:
            print("视频下载失败,但信息已提取")
        return video_dict


class DouyinCrawler:
    """Drives a Chromium-based browser to crawl Douyin search results."""

    def __init__(self, browser_path=None):
        """
        Args:
            browser_path: Path to the browser binary. Defaults to the
                macOS Microsoft Edge location.
        """
        if browser_path is None:
            browser_path = '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
        self.browser_path = browser_path
        self.driver = None

    def setup_browser(self):
        """Configure and launch the browser session."""
        co = ChromiumOptions().set_browser_path(self.browser_path)
        self.driver = ChromiumPage(co)

    def crawl_videos(self, keyword, video_dir):
        """Crawl up to 10 result pages for *keyword*, downloading each video.

        Requires :meth:`setup_browser` to have been called first.

        Args:
            keyword: Search keyword.
            video_dir: Directory for downloaded videos and metadata.
        """
        print(f"\n=== 开始爬取关键词: {keyword} ===")
        print(f"目标保存目录: {video_dir}")
        extractor = VideoInfoExtractor(video_dir)

        print("设置网络请求监听器...")
        # Intercept the search API the page calls while scrolling.
        self.driver.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')
        url = f'https://www.douyin.com/search/{keyword}?type=video'
        print(f"打开搜索页面: {url}")
        self.driver.get(url)

        data_list = []
        total_videos = 0
        print("\n开始爬取视频数据...")
        for page in range(10):
            print(f"\n=== 第 {page + 1} 页 ===")
            print("滚动到页面底部...")
            self.driver.scroll.to_bottom()
            print("等待网络响应...")
            # listen.wait() yields a falsy value on timeout; stop cleanly
            # instead of crashing on `resp.response` (original was unguarded).
            resp = self.driver.listen.wait(timeout=30)
            if not resp:
                print("等待网络响应超时,停止爬取")
                break
            json_data = resp.response.body
            # Error payloads may omit 'data'/'has_more'; use .get defensively.
            items = json_data.get('data') or []
            print(f"获取到 {len(items)} 个视频数据")
            for json_aweme_info in items:
                data = extractor.save_video_info(json_aweme_info['aweme_info'])
                data_list.append(data)
                total_videos += 1
                print(f"当前累计爬取: {total_videos} 个视频")
            if not json_data.get('has_more'):
                print("已到达最后一页,停止爬取")
                break
            # Random jitter between pages (same 2-5 s range as the original
            # clock-derived `time.time() % 3 + 2`).
            wait_time = random.uniform(2, 5)
            print(f"等待 {wait_time:.1f} 秒后继续下一页爬取...")
            time.sleep(wait_time)

        print(f"\n=== 爬取完成 ===")
        print(f"总计爬取: {total_videos} 个视频")
        if data_list:
            VideoManager.save_metadata(data_list, video_dir)
        else:
            print("没有爬取到视频数据,跳过元数据保存")

    def close(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()


class VideoManager:
    """Static helpers for the on-disk layout of crawled data."""

    @staticmethod
    def create_video_directory(keyword):
        """Create (or reuse) the per-keyword video directory.

        Args:
            keyword: Search keyword; used as the directory name.

        Returns:
            str: Path of the video directory.
        """
        video_dir = f"./douyin_videos/{keyword}"
        if not os.path.exists(video_dir):
            print(f"创建视频保存目录: {video_dir}")
        else:
            print(f"使用现有目录: {video_dir}")
        # exist_ok=True avoids the race between the existence check above
        # (used only for logging) and the actual creation.
        os.makedirs(video_dir, exist_ok=True)
        return video_dir

    @staticmethod
    def save_metadata(metadata_list, video_dir):
        """Write the crawled metadata list to ``metaData.json``.

        Args:
            metadata_list: List of per-video metadata dicts.
            video_dir: Directory to write the JSON file into.
        """
        metadata_file = os.path.join(video_dir, 'metaData.json')
        print(f"\n开始保存元数据到: {metadata_file}")
        print(f"共 {len(metadata_list)} 个视频的元数据")
        try:
            metadata = {
                # Local wall-clock save timestamp.
                'save_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                'video_count': len(metadata_list),
                'videos': metadata_list
            }
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print(f"元数据保存成功!")
            print(f"文件大小: {os.path.getsize(metadata_file) / 1024:.2f} KB")
        except Exception as e:
            # Metadata persistence is best-effort; report and continue.
            print(f"保存元数据时出错: {str(e)}")


def main():
    """CLI entry point: ``python pyauto.py <keyword>``."""
    if len(sys.argv) < 2:
        print("请提供关键词,例如:python pyauto.py 猫咪")
        sys.exit(1)

    keyword = sys.argv[1]
    video_dir = VideoManager.create_video_directory(keyword)

    crawler = DouyinCrawler()
    try:
        crawler.setup_browser()
        crawler.crawl_videos(keyword, video_dir)
    finally:
        # Always release the browser, even if crawling raised.
        crawler.close()


if __name__ == "__main__":
    main()