YouTube videos contain massive amounts of unstructured knowledge that never appears in text search results. Extracting transcripts and storing them in MongoDB creates a searchable knowledge base from video content that your agents and RAG pipelines can query directly.
The Pipeline
Step 1: Search YouTube for videos on your target topics.
Step 2: Extract transcripts from each video.
Step 3: Chunk transcripts into searchable segments.
Step 4: Store in MongoDB with text indexes.
Step 5: Query your video KB alongside web search results.
Step 1: Finding Relevant Videos
import requests, os
# Request headers carrying the API key, passed as `headers=` by find_videos.
# The key is read from the SCAVIO_API_KEY environment variable; the lookup
# raises KeyError when the variable is not set.
H = {"x-api-key": os.environ["SCAVIO_API_KEY"]}
def find_videos(topic, max_results=10):
    """Search YouTube for videos on a topic.

    Sends one POST request to the search endpoint with ``platform`` set to
    ``"youtube"`` and returns the first ``max_results`` entries of the
    response's ``"organic"`` list.

    Args:
        topic: Search query string.
        max_results: Maximum number of videos to return.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet`` and
        ``channel``. Fields missing from a result default to ``""``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers=H,
        json={"platform": "youtube", "query": topic},
        timeout=10,
    )
    # Fail loudly on auth/quota/server errors instead of treating the JSON
    # error body as an empty result set.
    response.raise_for_status()

    # `or []` also covers a payload where "organic" is present but null.
    items = response.json().get("organic") or []
    return [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "channel": item.get("channel", ""),
        }
        for item in items[:max_results]
    ]
# Example run: fetch videos for one topic and list each title with its URL.
# `videos` is reused by the ingestion loop further down.
videos = find_videos("search api tutorial for developers 2026")
for v in videos:
    print(f"{v['title']} - {v['url']}")

# Step 2: Transcript Extraction
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
      * ``https://youtu.be/<id>`` (query strings and trailing slashes ignored)
      * any URL carrying a ``v=<id>`` query parameter (``/watch?v=<id>``)
      * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` on
        youtube.com / youtube-nocookie.com hosts

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    parsed = urlparse(url)
    host = parsed.hostname or ""
    # Non-empty path components, so stray or trailing slashes are harmless.
    segments = [part for part in parsed.path.split("/") if part]

    if host in ("youtu.be", "www.youtu.be"):
        # Short links keep the ID as the first path component.
        return segments[0] if segments else None

    video_id = parse_qs(parsed.query).get("v", [None])[0]
    if video_id:
        return video_id

    is_youtube_host = (
        host == "youtube.com"
        or host.endswith(".youtube.com")
        or host.endswith("youtube-nocookie.com")
    )
    path_prefixes = ("shorts", "embed", "live", "v")
    if is_youtube_host and len(segments) >= 2 and segments[0] in path_prefixes:
        return segments[1]
    return None
def get_transcript(video_url):
    """Get the transcript of a YouTube video.

    Args:
        video_url: URL of the video; the ID is taken from it with
            ``extract_video_id``.

    Returns:
        ``None`` when no video ID can be extracted or the transcript cannot
        be fetched. Otherwise a dict with:
          * ``video_id``: the extracted ID,
          * ``segments``: the raw transcript entries (dicts that carry at
            least a ``"text"`` field),
          * ``full_text``: all segment texts joined with single spaces.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return None
    try:
        # Newer youtube-transcript-api releases dropped the static
        # get_transcript() in favour of an instance fetch() call. Support
        # both so the missing attribute is not silently reported as
        # "Transcript unavailable" for every video.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return {
            "video_id": video_id,
            "segments": transcript,
            "full_text": " ".join(s["text"] for s in transcript),
        }
    except Exception as e:
        # Best effort by design: a video without a usable transcript is
        # reported and skipped rather than aborting the whole run.
        print(f"Transcript unavailable for {video_id}: {e}")
        return None


# Step 3: Chunking for Search
def chunk_transcript(transcript_data, chunk_size=500):
    """Split a transcript into searchable chunks of ``chunk_size`` words.

    Args:
        transcript_data: Dict with ``video_id``, ``full_text`` and
            ``segments`` (each segment a dict with ``text`` and ``start``).
            A falsy value yields an empty list.
        chunk_size: Number of whitespace-separated words per chunk.

    Returns:
        A list of dicts with the keys ``text``, ``video_id``, ``timestamp``
        (the ``start`` of the segment containing the chunk's first word) and
        ``chunk_index`` (0-based position of the chunk).
    """
    if not transcript_data:
        return []

    words = transcript_data["full_text"].split()

    # Start time of the segment each word belongs to, in word order. Built
    # once, so every chunk lookup is O(1) and a chunk that begins exactly on
    # a segment boundary gets that segment's time, not the previous one's.
    word_starts = []
    for seg in transcript_data["segments"]:
        word_starts.extend([seg["start"]] * len(seg["text"].split()))

    chunks = []
    for first_word in range(0, len(words), chunk_size):
        if first_word < len(word_starts):
            timestamp = word_starts[first_word]
        else:
            # full_text has more words than the segments account for:
            # fall back to the last known time, or 0 without segments.
            timestamp = word_starts[-1] if word_starts else 0
        chunks.append({
            "text": " ".join(words[first_word:first_word + chunk_size]),
            "video_id": transcript_data["video_id"],
            "timestamp": timestamp,
            "chunk_index": first_word // chunk_size,
        })
    return chunks


# Step 4: MongoDB Storage
from pymongo import MongoClient
# Connect to the local MongoDB server and select the database and collection
# that hold the transcript chunks.
client = MongoClient("mongodb://localhost:27017")
db = client.get_database("video_kb")
chunks_col = db.get_collection("transcript_chunks")

# Text index on the chunk body; required by the $text queries issued in
# search_video_kb.
chunks_col.create_index([("text", "text")])
def ingest_video(video_info, transcript):
    """Chunk a transcript and store the chunks in MongoDB.

    Each chunk produced by ``chunk_transcript`` is extended with the video's
    title, URL and channel before being inserted into ``chunks_col``.

    Args:
        video_info: Dict with ``title`` and ``url``; ``channel`` is optional
            and defaults to ``""``.
        transcript: Transcript dict as accepted by ``chunk_transcript``.

    Returns:
        The number of chunks inserted (0 when the transcript yields none).
    """
    pieces = chunk_transcript(transcript)
    if not pieces:
        # Nothing to store; insert_many would reject an empty batch.
        return 0

    metadata = {
        "video_title": video_info["title"],
        "video_url": video_info["url"],
        "channel": video_info.get("channel", ""),
    }
    documents = [{**piece, **metadata} for piece in pieces]
    chunks_col.insert_many(documents)
    return len(documents)
# Ingest all videos for a topic; videos without a transcript are skipped.
for video in videos:
    transcript = get_transcript(video["url"])
    if not transcript:
        continue
    count = ingest_video(video, transcript)
    print(f"Ingested {count} chunks from: {video['title']}")

# Step 5: Querying the Video KB
def search_video_kb(query, limit=5):
    """Search transcript chunks in MongoDB.

    Runs a ``$text`` query against ``chunks_col`` and returns the best
    matches ordered by text score.

    Args:
        query: Search string handed to MongoDB's ``$search``.
        limit: Maximum number of chunks to return.

    Returns:
        A list of dicts with:
          * ``text``: the first 300 characters of the chunk,
          * ``video``: the video title,
          * ``url``: the video URL with a ``t=<seconds>`` parameter taken
            from the chunk's timestamp,
          * ``score``: the MongoDB text score.
    """
    cursor = (
        chunks_col.find(
            {"$text": {"$search": query}},
            {"score": {"$meta": "textScore"}},
        )
        .sort([("score", {"$meta": "textScore"})])
        .limit(limit)
    )

    hits = []
    for doc in cursor:
        video_url = doc["video_url"]
        # URLs without a query string (e.g. https://youtu.be/<id>) need "?"
        # to start one; "&" is only valid when a query already exists.
        separator = "&" if "?" in video_url else "?"
        hits.append({
            "text": doc["text"][:300],
            "video": doc["video_title"],
            "url": f"{video_url}{separator}t={int(doc['timestamp'])}",
            "score": doc["score"],
        })
    return hits
# Search across all ingested video transcripts and print, per hit, the title
# with its score, the timestamped link, and a 100-character preview.
results = search_video_kb("how to set up search api")
for r in results:
    print(
        f"{r['video']} (score: {r['score']:.2f})",
        f" {r['url']}",
        f" {r['text'][:100]}...",
        sep="\n",
    )

# Combining Video KB with Web Search
For RAG pipelines, search your video KB first for domain-specific knowledge, then supplement with web search for current information. Video transcripts often contain practical, step-by-step knowledge that blog posts summarize but videos demonstrate in depth.