Phase 2 · RAG and Tool Calling · 7 min read

Document Parsing for RAG

Phase 2 of 8

Before you can search your documents, you need to extract text from them. In this guide, you'll learn to parse PDFs, text files, and web pages - the foundation of any RAG system.

Coming from Software Engineering? Document parsing for AI is the same ETL problem you've solved before — extract data from messy sources, transform it, load it somewhere useful. If you've built data pipelines that ingest PDFs, CSVs, or API responses, this is the same skill applied to AI knowledge bases.


The RAG Pipeline

Today we focus on the Parse step!


Parsing Text Files

The easiest format - just read it:

# script_id: day_024_document_parsing/text_and_markdown_parsing
from pathlib import Path

def parse_text_file(file_path: str) -> str:
    """Read a plain text file, trying several common encodings.

    Args:
        file_path: Path to the file on disk.

    Returns:
        The file's full contents as a string.

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
        ValueError: If none of the candidate encodings can decode the file.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Try the most common encodings in order of likelihood.
    for candidate in ('utf-8', 'latin-1', 'cp1252'):
        try:
            return path.read_text(encoding=candidate)
        except UnicodeDecodeError:
            continue

    raise ValueError(f"Could not decode file: {file_path}")

# Usage
# NOTE: illustrative only — requires a "document.txt" file in the
# current working directory.
content = parse_text_file("document.txt")
print(f"Extracted {len(content)} characters")

Parsing Markdown

# script_id: day_024_document_parsing/text_and_markdown_parsing
import re

def parse_markdown(file_path: str) -> dict:
    """Read a markdown file and pull out its basic structure.

    Args:
        file_path: Path to the markdown file.

    Returns:
        Dict with the document title, the raw markdown, a plain-text
        version with code stripped, and the count of ## sections.
    """
    content = parse_text_file(file_path)

    # The title is the first level-1 heading, if the file has one.
    heading = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    if heading:
        title = heading.group(1)
    else:
        title = "Untitled"

    # Count level-2 headings by splitting on them; the text before the
    # first "## " is not a section, hence the -1 in the return value.
    parts = re.split(r'^##\s+', content, flags=re.MULTILINE)

    # Swap fenced code blocks for a placeholder, then drop inline
    # code spans entirely so they don't pollute the plain text.
    stripped = re.sub(r'```[\s\S]*?```', '[CODE BLOCK]', content)
    stripped = re.sub(r'`[^`]+`', '', stripped)

    return {
        "title": title,
        "raw": content,
        "plain_text": stripped,
        "sections": len(parts) - 1,
    }

Parsing PDFs

PDFs are trickier - they're designed for display, not extraction.

Recommendation: For new projects, start with PyMuPDF (fitz) as your default PDF parser — it is the fastest and handles most layouts accurately. Use pdfplumber when you need precise table extraction. PyPDF2 is shown first below for simplicity, but PyMuPDF is the better production choice.

Using PyPDF2

pip install pypdf2
# script_id: day_024_document_parsing/parse_pdf_pypdf2
# The pip package is "pypdf2", but the importable module is "PyPDF2"
# (case-sensitive); `from pypdf2 import ...` raises ModuleNotFoundError.
from PyPDF2 import PdfReader

def parse_pdf_pypdf2(file_path: str) -> dict:
    """Parse a PDF using PyPDF2.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        Dict with the page count, per-page text, concatenated full
        text, and any document metadata PyPDF2 exposes.
    """
    reader = PdfReader(file_path)

    pages = []
    full_text = []

    for i, page in enumerate(reader.pages):
        # extract_text() can return None (e.g. image-only pages).
        text = page.extract_text() or ""
        pages.append({
            "page_number": i + 1,
            "text": text
        })
        full_text.append(text)

    return {
        "num_pages": len(reader.pages),
        "pages": pages,
        "full_text": "\n\n".join(full_text),
        "metadata": dict(reader.metadata) if reader.metadata else {}
    }

# Usage
# NOTE: illustrative only — requires a "document.pdf" file in the
# current working directory.
result = parse_pdf_pypdf2("document.pdf")
print(f"Extracted {result['num_pages']} pages")
print(f"Total text: {len(result['full_text'])} characters")

Using pdfplumber (Better for Tables)

pip install pdfplumber
# script_id: day_024_document_parsing/parse_pdf_plumber
import pdfplumber

def parse_pdf_plumber(file_path: str) -> dict:
    """Parse a PDF with pdfplumber, which excels at table extraction.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        Dict with the page count, per-page text, every table found
        (tagged with its page number), and the concatenated full text.
    """
    pages = []
    tables = []

    with pdfplumber.open(file_path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None for image-only pages.
            pages.append({
                "page_number": number,
                "text": page.extract_text() or ""
            })

            # Record every table alongside the page it came from.
            tables.extend(
                {"page": number, "data": table}
                for table in page.extract_tables()
            )

    combined = "\n\n".join(entry["text"] for entry in pages)
    return {
        "num_pages": len(pages),
        "pages": pages,
        "tables": tables,
        "full_text": combined
    }

Using PyMuPDF (Fastest)

pip install pymupdf
# script_id: day_024_document_parsing/parse_pdf_mupdf
import fitz  # PyMuPDF

def parse_pdf_mupdf(file_path: str) -> dict:
    """Parse a PDF using PyMuPDF (the fastest of the three options).

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        Dict with the page count, per-page text, concatenated full
        text, and the document metadata reported by PyMuPDF.
    """
    doc = fitz.open(file_path)

    # Pages are indexed from 0 in PyMuPDF; report them 1-based.
    pages = [
        {"page_number": number + 1, "text": doc[number].get_text()}
        for number in range(len(doc))
    ]

    return {
        "num_pages": len(doc),
        "pages": pages,
        "full_text": "\n\n".join(entry["text"] for entry in pages),
        "metadata": doc.metadata
    }

Web Scraping

Extract content from web pages:

Using BeautifulSoup

pip install beautifulsoup4 requests
# script_id: day_024_document_parsing/web_and_universal_loader
import requests
from bs4 import BeautifulSoup

def parse_webpage(url: str) -> dict:
    """Fetch a web page and extract its title, main text, and links.

    Args:
        url: Fully-qualified URL to fetch.

    Returns:
        Dict with the url, page title, extracted text, the first 20
        link hrefs, and the raw HTML.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; DocumentParser/1.0)'
    }

    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script and style elements plus obvious boilerplate tags.
    for element in soup(['script', 'style', 'nav', 'footer', 'header']):
        element.decompose()

    # Get title. BUGFIX: soup.title.string is None for an empty or
    # nested <title> tag, so check the string too before using it.
    title = soup.title.string if soup.title and soup.title.string else "Untitled"

    # Get main content by trying common content containers first.
    main_content = None
    for selector in ['article', 'main', '.content', '#content', '.post']:
        main_content = soup.select_one(selector)
        if main_content:
            break

    if main_content:
        text = main_content.get_text(separator='\n', strip=True)
    else:
        text = soup.body.get_text(separator='\n', strip=True) if soup.body else ""

    # Extract links
    links = [a.get('href') for a in soup.find_all('a', href=True)]

    return {
        "url": url,
        "title": title,
        "text": text,
        "links": links[:20],  # First 20 links
        "raw_html": response.text
    }

# Usage
# NOTE: illustrative only — performs a live HTTP request to the URL.
result = parse_webpage("https://example.com/article")
print(f"Title: {result['title']}")
print(f"Text length: {len(result['text'])}")

Clean Text Extraction

# script_id: day_024_document_parsing/web_and_universal_loader
import re

def clean_extracted_text(text: str) -> str:
    """Normalize whitespace and strip artifacts from extracted text.

    Args:
        text: Raw text pulled from a PDF or HTML page.

    Returns:
        Cleaned text: runs of spaces/tabs collapsed, excessive blank
        lines reduced, control characters removed, edges stripped.
    """
    # Collapse runs of spaces/tabs only. BUGFIX: the original used
    # r'\s+', which also swallows newlines and made the newline
    # cleanup below dead code.
    text = re.sub(r'[ \t]+', ' ', text)

    # Reduce 3+ consecutive newlines to a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Remove control characters and other non-printable artifacts.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

    # Strip leading/trailing whitespace.
    return text.strip()

def extract_main_content(html: str) -> str:
    """Strip boilerplate from raw HTML and return cleaned body text.

    Args:
        html: Raw HTML markup.

    Returns:
        Cleaned plain text with scripts, navigation, forms, and other
        boilerplate removed.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Drop structural/noise tags wholesale.
    noise_tags = (
        'script', 'style', 'nav', 'footer', 'header',
        'aside', 'form', 'iframe', 'noscript'
    )
    for tag_name in noise_tags:
        for node in soup.find_all(tag_name):
            node.decompose()

    # Drop elements whose class name looks like page chrome
    # (menus, sidebars, ads, comment widgets, ...).
    chrome = re.compile(r'(nav|menu|sidebar|footer|header|ad|comment)')
    for node in soup.find_all(class_=chrome):
        node.decompose()

    return clean_extracted_text(soup.get_text(separator='\n', strip=True))

Universal Document Loader

# script_id: day_024_document_parsing/web_and_universal_loader
from pathlib import Path
from typing import Union
import mimetypes

class DocumentLoader:
    """Universal document loader supporting multiple formats.

    Dispatches on URL scheme or file extension and returns a uniform
    dict with "source", "type", "content", and "metadata" keys.
    """

    def __init__(self):
        # Map file extensions to their parser methods.
        self.parsers = {
            '.txt': self._parse_text,
            '.md': self._parse_text,
            '.pdf': self._parse_pdf,
            '.html': self._parse_html,
            '.htm': self._parse_html,
        }

    def load(self, source: Union[str, Path]) -> dict:
        """Load a document from a file path or an http(s) URL.

        Raises:
            ValueError: If the file extension has no registered parser.
        """
        source = str(source)

        # URLs go to the web parser; files dispatch on their suffix.
        if source.startswith(('http://', 'https://')):
            return self._parse_url(source)

        path = Path(source)
        suffix = path.suffix.lower()

        parser = self.parsers.get(suffix)
        if parser is None:
            raise ValueError(f"Unsupported file type: {suffix}")

        return parser(path)

    def _parse_text(self, path: Path) -> dict:
        """Read a UTF-8 text or markdown file verbatim."""
        return {
            "source": str(path),
            "type": "text",
            "content": path.read_text(encoding='utf-8'),
            "metadata": {"filename": path.name}
        }

    def _parse_pdf(self, path: Path) -> dict:
        """Extract text from every PDF page via PyMuPDF."""
        import fitz
        doc = fitz.open(str(path))
        joined = "\n\n".join(page.get_text() for page in doc)
        return {
            "source": str(path),
            "type": "pdf",
            "content": joined,
            "metadata": {
                "filename": path.name,
                "pages": len(doc),
                **doc.metadata
            }
        }

    def _parse_html(self, path: Path) -> dict:
        """Read an HTML file and strip boilerplate to plain text."""
        markup = path.read_text(encoding='utf-8')
        return {
            "source": str(path),
            "type": "html",
            "content": extract_main_content(markup),
            "metadata": {"filename": path.name}
        }

    def _parse_url(self, url: str) -> dict:
        """Fetch a web page and wrap the parsed result."""
        parsed = parse_webpage(url)
        return {
            "source": url,
            "type": "url",
            "content": parsed["text"],
            "metadata": {"title": parsed["title"], "url": url}
        }

# Usage
loader = DocumentLoader()

# Load different formats
# NOTE: illustrative only — the local files must exist and the URL is
# fetched live.
doc1 = loader.load("report.pdf")
doc2 = loader.load("article.md")
doc3 = loader.load("https://example.com/page")

for doc in [doc1, doc2, doc3]:
    print(f"{doc['type']}: {len(doc['content'])} chars")

Handling Large Documents

# script_id: day_024_document_parsing/web_and_universal_loader
def load_directory(
    directory: str,
    extensions: list[str] = None,
    max_files: int = None
) -> list[dict]:
    """Load all supported documents under a directory, recursively.

    Args:
        directory: Root directory to scan.
        extensions: File suffixes to accept (default:
            .txt/.md/.pdf/.html). Matched case-insensitively.
        max_files: Stop after this many documents (None = no limit).

    Returns:
        List of document dicts as produced by DocumentLoader.load().
    """
    path = Path(directory)
    loader = DocumentLoader()

    if extensions is None:
        extensions = ['.txt', '.md', '.pdf', '.html']
    # Normalize once so per-file checks are O(1) and case-insensitive.
    allowed = {ext.lower() for ext in extensions}

    documents = []
    # Iterate rglob lazily instead of materializing the entire tree
    # up front — this section is about *large* document sets.
    for file_path in path.rglob('*'):
        if not file_path.is_file() or file_path.suffix.lower() not in allowed:
            continue

        try:
            doc = loader.load(file_path)
        except Exception as e:
            # Best-effort: report and keep going on unreadable files.
            print(f"Error loading {file_path}: {e}")
            continue

        documents.append(doc)
        print(f"Loaded: {file_path.name}")

        if max_files and len(documents) >= max_files:
            break

    return documents

# Usage
# NOTE: illustrative only — expects a ./documents directory to exist.
docs = load_directory("./documents", extensions=[".pdf", ".txt"], max_files=100)
print(f"Loaded {len(docs)} documents")

Summary


Quick Reference

# script_id: day_024_document_parsing/quick_reference
# NOTE: each stanza below is an independent cheat-sheet snippet; the
# first assumes `from pathlib import Path`, and the last assumes a
# `url` variable is already defined.

# Text files
content = Path("file.txt").read_text()

# PDFs (PyMuPDF)
import fitz
doc = fitz.open("file.pdf")
text = "\n".join(page.get_text() for page in doc)

# Web pages
from bs4 import BeautifulSoup
import requests
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text()

What's Next?

Now that you can extract text, let's learn Text Chunking Strategies - how to split documents into optimal pieces for retrieval!