DEV Community

Cover image for Fetch YouTube video transcripts of travel creators
Yogesh Bansal
Yogesh Bansal

Posted on

Fetch YouTube video transcripts of travel creators



import requests
import csv
import time
import pandas as pd

# Credential for the YouTube Data API; substitute a real key before running.
API_KEY = "YOUR_API_KEY"

# Root endpoint that every API request in this script is built on.
BASE_URL = "https://www.googleapis.com/youtube/v3"

# Travel-themed phrases; each one is issued as its own channel search.
SEARCH_KEYWORDS = [
    "travel vlog",
    "travel guide",
    "food travel",
    "adventure travel",
    "backpacking",
    "cultural travel experiences",
    "travel tips",
    "travel destinations",
    "travel blogger",
    "world travel",
    "wanderlust",
    "exploring the world",
    "travel channel",
    "travel documentary",
    "road trips",
]

def collect_travel_channels(api_key, search_keywords):
    """Collect travel-related YouTube channels and save them to a CSV file.

    Issues one channel search per keyword (first result page only, up to 50
    results), de-duplicates channels by ID across keywords, and writes the
    result to ``travel_channels.csv`` in the current working directory.

    A keyword whose request fails (network error, non-JSON body, or an API
    payload without ``items``) is reported and skipped so the remaining
    keywords are still processed.

    Args:
        api_key: YouTube Data API key sent with every request.
        search_keywords: Iterable of search phrases to query.
    """
    columns = ['channelId', 'channelTitle', 'description']
    channels = []
    seen_channel_ids = set()

    for keyword in search_keywords:
        print(f"Searching for keyword: {keyword}")
        params = {
            'part': 'snippet',
            'q': keyword,
            'type': 'channel',
            'maxResults': 50,
            'key': api_key
        }
        try:
            # Without a timeout a stalled connection would hang the script forever.
            response = requests.get(f"{BASE_URL}/search", params=params, timeout=30)
            result = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a non-JSON body on older versions of requests.
            print(f"Request failed for keyword {keyword!r}: {exc}")
        else:
            if 'items' in result:
                for item in result['items']:
                    snippet = item['snippet']
                    channel_id = snippet['channelId']
                    if channel_id not in seen_channel_ids:
                        seen_channel_ids.add(channel_id)
                        channels.append({
                            'channelId': channel_id,
                            'channelTitle': snippet['channelTitle'],
                            'description': snippet['description']
                        })
            else:
                print(f"Error in response: {result}")

        # Pause to respect API rate limits
        time.sleep(1)

    # Passing the columns explicitly keeps the header row even when nothing was
    # collected; a header-less file cannot be read back by pandas.read_csv.
    channels_df = pd.DataFrame(channels, columns=columns)
    channels_df.to_csv('travel_channels.csv', index=False, encoding='utf-8')
    print("Finished collecting channel data.")

def collect_channel_videos(api_key):
    """Retrieve the video list of every collected channel and save it to CSV.

    Reads ``travel_channels.csv`` from the current working directory, looks up
    each channel's uploads playlist, fetches the playlist's videos, and writes
    one row per video to ``channel_videos.csv``. Channels without an uploads
    playlist are reported and skipped.

    Args:
        api_key: YouTube Data API key passed through to the lookup helpers.
    """
    video_columns = ['channelId', 'channelTitle', 'videoId', 'videoTitle', 'publishedAt']

    try:
        # dtype=str / keep_default_na=False: IDs and titles are free text, so a
        # channel literally named "NA" or "null" must not be parsed as NaN.
        channels_df = pd.read_csv(
            'travel_channels.csv',
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )
    except pd.errors.EmptyDataError:
        # The channel file has no header and no rows, i.e. nothing was collected.
        print("No channels found in travel_channels.csv.")
        channels_df = pd.DataFrame(columns=['channelId', 'channelTitle'])

    videos = []

    for channel_id, channel_title in zip(channels_df['channelId'], channels_df['channelTitle']):
        print(f"Processing channel: {channel_title} (ID: {channel_id})")

        # Get uploads playlist ID
        uploads_playlist_id = get_uploads_playlist_id(api_key, channel_id)
        if uploads_playlist_id:
            # Get videos from playlist
            channel_videos = get_videos_from_playlist(api_key, uploads_playlist_id)
            videos.extend(
                {
                    'channelId': channel_id,
                    'channelTitle': channel_title,
                    'videoId': video['videoId'],
                    'videoTitle': video['videoTitle'],
                    'publishedAt': video['publishedAt']
                }
                for video in channel_videos
            )
        else:
            print(f"Skipping channel {channel_title} due to missing uploads playlist.")

        # Pause between channels
        time.sleep(1)

    # Explicit columns keep the header row even when no videos were found.
    videos_df = pd.DataFrame(videos, columns=video_columns)
    videos_df.to_csv('channel_videos.csv', index=False, encoding='utf-8')
    print("Finished collecting video data.")

def get_uploads_playlist_id(api_key, channel_id):
    """Return the uploads playlist ID for a channel, or None if unavailable.

    Queries the ``channels`` endpoint for the channel's ``contentDetails`` and
    reads ``relatedPlaylists.uploads`` from the first returned item.

    Args:
        api_key: YouTube Data API key sent with the request.
        channel_id: ID of the channel to look up.

    Returns:
        The uploads playlist ID, or None when the request fails or the
        response contains no items (a message is printed in either case).
    """
    params = {
        'part': 'contentDetails',
        'id': channel_id,
        'key': api_key
    }
    try:
        # Without a timeout a stalled connection would hang the script forever.
        response = requests.get(f"{BASE_URL}/channels", params=params, timeout=30)
        result = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older versions of requests.
        # Report and return None so the caller skips this channel, exactly as
        # it does for an empty response.
        print(f"Could not get uploads playlist for channel ID: {channel_id} ({exc})")
        return None

    items = result.get('items')
    if items:
        return items[0]['contentDetails']['relatedPlaylists']['uploads']

    print(f"Could not get uploads playlist for channel ID: {channel_id}")
    return None

def get_videos_from_playlist(api_key, playlist_id):
    """Retrieve all videos from a playlist, following pagination.

    Requests ``playlistItems`` pages of up to 50 entries until the response no
    longer carries a ``nextPageToken``.

    Args:
        api_key: YouTube Data API key sent with every request.
        playlist_id: ID of the playlist to enumerate.

    Returns:
        A list of dicts with keys ``videoId``, ``videoTitle`` and
        ``publishedAt``. ``publishedAt`` is None for items where the API does
        not report ``videoPublishedAt``. If a page fails, the videos gathered
        so far are returned.
    """
    videos = []
    params = {
        'part': 'snippet,contentDetails',
        'playlistId': playlist_id,
        'maxResults': 50,
        'key': api_key
    }

    while True:
        try:
            # Without a timeout a stalled connection would hang the script forever.
            response = requests.get(f"{BASE_URL}/playlistItems", params=params, timeout=30)
            result = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a non-JSON body on older versions of requests.
            # Keep the pages already fetched rather than losing them.
            print(f"Error retrieving videos: {exc}")
            break

        if 'items' not in result:
            print(f"Error retrieving videos: {result}")
            break

        for item in result['items']:
            content_details = item['contentDetails']
            videos.append({
                'videoId': content_details['videoId'],
                'videoTitle': item['snippet']['title'],
                # videoPublishedAt can be missing (e.g. private or deleted
                # videos); indexing it directly raised KeyError.
                'publishedAt': content_details.get('videoPublishedAt')
            })

        next_page_token = result.get('nextPageToken')
        if not next_page_token:
            break

        params['pageToken'] = next_page_token
        # Short pause between pages to stay gentle on the API.
        time.sleep(0.5)

    return videos

def main():
    """Run both collection stages in order: channels first, then their videos."""
    key = API_KEY
    # Stage 1 writes travel_channels.csv, which stage 2 reads back.
    collect_travel_channels(key, SEARCH_KEYWORDS)
    collect_channel_videos(key)
    print("Data collection complete.")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()


Enter fullscreen mode Exit fullscreen mode

Top comments (0)