toutiao-pachong/douyin_video_crawler.py

import json
import os
import sys
import time

import requests
from DrissionPage import ChromiumOptions, ChromiumPage
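
# Third-party dependencies (assumed to be installed): DrissionPage drives a
# Chromium-based browser and intercepts its network traffic; requests streams
# the video files to disk.
#   pip install DrissionPage requests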


class VideoDownloader:
    """Video downloader."""

    def __init__(self, video_dir):
        """
        Initialize the video downloader.

        Args:
            video_dir: Directory where videos are saved.
        """
        self.video_dir = video_dir

    def save_video(self, video_url, aweme_id, video_title):
        """
        Download a video to local disk.

        Args:
            video_url: Video download URL.
            aweme_id: Unique identifier of the video.
            video_title: Video title.

        Returns:
            bool: Whether the download succeeded.
        """
        try:
            # Clean the video title: replace characters that are illegal in file names.
            def clean_filename(filename):
                invalid_chars = '<>:"/\\|?*'
                for char in invalid_chars:
                    filename = filename.replace(char, '_')
                return filename[:50]  # Cap the file name length.

            cleaned_title = clean_filename(video_title)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...',
                'Referer': 'https://www.douyin.com/'
            }
            print(f"Starting video download: {cleaned_title}")
            print(f"Video ID: {aweme_id}")
            print(f"Download URL: {video_url}")
            response = requests.get(video_url, headers=headers, stream=True, timeout=30)
            if response.status_code == 200:
                video_path = os.path.join(self.video_dir, f'{cleaned_title}_{aweme_id}.mp4')
                print(f"Save path: {video_path}")
                total_size = int(response.headers.get('content-length', 0))
                downloaded_size = 0
                # Stream the body in 1 MiB chunks so large videos are never held in memory.
                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                progress = (downloaded_size / total_size) * 100
                                print(f"Download progress: {progress:.1f}%", end='\r')
                print(f"\nVideo download finished: {cleaned_title}")
                # Quasi-random 1-3 second pause derived from the clock, to avoid
                # hammering the server with back-to-back requests.
                wait_time = time.time() % 2 + 1
                print(f"Waiting {wait_time:.1f} seconds before continuing...")
                time.sleep(wait_time)
                return True
            else:
                print(f"Download failed, status code: {response.status_code}")
                return False
        except Exception as e:
            print(f"Error while downloading video: {e}")
            return False
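

# Minimal usage sketch for VideoDownloader on its own (the URL, id, and title
# below are placeholders, not real Douyin values):
#
#   downloader = VideoDownloader('./douyin_videos/test')
#   downloader.save_video('https://example.com/video.mp4', '1234567890', 'demo title')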


class VideoInfoExtractor:
    """Video info extractor."""

    def __init__(self, video_dir):
        """
        Initialize the video info extractor.

        Args:
            video_dir: Directory where videos are saved.
        """
        self.downloader = VideoDownloader(video_dir)

    def save_video_info(self, video_data):
        """
        Extract the video info and download the video.

        Args:
            video_data: Dict holding the raw video info.

        Returns:
            dict: The extracted video info.
        """
        print("\nExtracting video info...")
        minutes = video_data['video']['duration'] // 1000 // 60
        seconds = video_data['video']['duration'] // 1000 % 60
        # Swapping 'playwm' for 'play' in the URL selects the watermark-free stream.
        video_url = video_data['video']['play_addr']['url_list'][0].replace('playwm', 'play')
        video_title = video_data['desc'].strip().replace('\n', '')
        aweme_id = video_data['aweme_id']
        print(f"Title: {video_title}")
        print(f"Duration: {minutes}:{seconds:02d}")
        print(f"Author: {video_data['author']['nickname'].strip()}")
        print(f"Followers: {video_data['author']['follower_count']}")
        print(f"Likes: {video_data['statistics']['digg_count']}")
        video_dict = {
            'username': video_data['author']['nickname'].strip(),
            # Prefix the uid with 'a' so downstream tools keep it as a string.
            'user_uid': 'a' + str(video_data['author']['uid']),
            'follower_count': video_data['author']['follower_count'],
            'description': video_title,
            'title': video_title,
            'digg_count': video_data['statistics']['digg_count'],
            'aweme_id': aweme_id,
            'duration': f"{minutes}:{seconds:02d}",
            'video_url': video_url,
        }
        print("Starting video download...")
        download_success = self.downloader.save_video(video_url, aweme_id, video_title)
        if download_success:
            print("Video info extracted and video downloaded!")
        else:
            print("Video download failed, but its info was extracted")
        return video_dict
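

# Shape of the minimal `video_data` dict that save_video_info reads (fields
# inferred from the accesses above; real search responses carry many more
# keys, and the values here are placeholders):
#
#   {
#       'aweme_id': '7...',
#       'desc': 'video description',
#       'video': {
#           'duration': 15000,  # milliseconds
#           'play_addr': {'url_list': ['https://.../playwm/...']},
#       },
#       'author': {'nickname': 'name', 'uid': 123, 'follower_count': 0},
#       'statistics': {'digg_count': 0},
#   }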


class DouyinCrawler:
    """Douyin crawler."""

    def __init__(self, browser_path=None):
        """
        Initialize the Douyin crawler.

        Args:
            browser_path: Browser executable path; defaults to the standard
                Microsoft Edge location on macOS.
        """
        if browser_path is None:
            browser_path = '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
        self.browser_path = browser_path
        self.driver = None

    def setup_browser(self):
        """Configure and launch the browser."""
        co = ChromiumOptions().set_browser_path(self.browser_path)
        self.driver = ChromiumPage(co)

    def crawl_videos(self, keyword, video_dir):
        """
        Crawl video data for a search keyword.

        Args:
            keyword: Search keyword.
            video_dir: Directory where videos are saved.
        """
        print(f"\n=== Crawling keyword: {keyword} ===")
        print(f"Target save directory: {video_dir}")
        extractor = VideoInfoExtractor(video_dir)
        # Intercept the search API responses instead of parsing the page DOM.
        print("Setting up network request listener...")
        self.driver.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')
        url = f'https://www.douyin.com/search/{keyword}?type=video'
        print(f"Opening search page: {url}")
        self.driver.get(url)
        data_list = []
        total_videos = 0
        print("\nCrawling video data...")
        for page in range(10):
            print(f"\n=== Page {page + 1} ===")
            print("Scrolling to the bottom of the page...")
            self.driver.scroll.to_bottom()
            print("Waiting for network response...")
            resp = self.driver.listen.wait()
            json_data = resp.response.body
            print(f"Received {len(json_data['data'])} video entries")
            for json_aweme_info in json_data['data']:
                data = extractor.save_video_info(json_aweme_info['aweme_info'])
                data_list.append(data)
                total_videos += 1
                print(f"Crawled so far: {total_videos} videos")
            if not json_data['has_more']:
                print("Reached the last page, stopping")
                break
            # Quasi-random 2-5 second pause between pages.
            wait_time = time.time() % 3 + 2
            print(f"Waiting {wait_time:.1f} seconds before crawling the next page...")
            time.sleep(wait_time)
        print("\n=== Crawl finished ===")
        print(f"Total videos crawled: {total_videos}")
        # Persist the metadata.
        if data_list:
            VideoManager.save_metadata(data_list, video_dir)
        else:
            print("No video data crawled, skipping metadata save")

    def close(self):
        """Close the browser."""
        if self.driver:
            self.driver.quit()
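

# Note on the listener: as I understand DrissionPage's API, listen.start(target)
# captures requests whose URL contains the target substring, and listen.wait()
# blocks until the next matching response arrives, returning a packet whose
# response.body holds the parsed JSON. Scrolling to the bottom of the results
# page is what makes Douyin fire the next search/item request.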


class VideoManager:
    """Video manager."""

    @staticmethod
    def create_video_directory(keyword):
        """
        Create the directory videos are saved into.

        Args:
            keyword: Search keyword.

        Returns:
            str: Path of the video directory.
        """
        video_dir = f"./douyin_videos/{keyword}"
        if not os.path.exists(video_dir):
            print(f"Creating video directory: {video_dir}")
            os.makedirs(video_dir)
        else:
            print(f"Using existing directory: {video_dir}")
        return video_dir

    @staticmethod
    def save_metadata(metadata_list, video_dir):
        """
        Save the video metadata to a JSON file.

        Args:
            metadata_list: List of video metadata dicts.
            video_dir: Directory to save into.
        """
        metadata_file = os.path.join(video_dir, 'metaData.json')
        print(f"\nSaving metadata to: {metadata_file}")
        print(f"Metadata for {len(metadata_list)} videos")
        try:
            # Record the save time alongside the data.
            metadata = {
                'save_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                'video_count': len(metadata_list),
                'videos': metadata_list
            }
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print("Metadata saved successfully!")
            print(f"File size: {os.path.getsize(metadata_file) / 1024:.2f} KB")
        except Exception as e:
            print(f"Error while saving metadata: {e}")


def main():
    """Entry point."""
    if len(sys.argv) < 2:
        print("Please provide a keyword, e.g.: python douyin_video_crawler.py 猫咪")
        sys.exit(1)
    keyword = sys.argv[1]
    # Create the directory the videos will be saved into.
    video_dir = VideoManager.create_video_directory(keyword)
    # Create the crawler instance and start crawling.
    crawler = DouyinCrawler()
    try:
        crawler.setup_browser()
        crawler.crawl_videos(keyword, video_dir)
    finally:
        crawler.close()


if __name__ == "__main__":
    main()
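

# Usage sketch, run from a shell (assumes Microsoft Edge at the default macOS
# path hard-coded in DouyinCrawler; on other platforms construct the crawler
# with an explicit browser_path, e.g. a hypothetical Windows install):
#
#   python douyin_video_crawler.py 猫咪
#
#   crawler = DouyinCrawler(
#       browser_path=r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe')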