Skip to content

Utilities

Utility functions and classes for file handling and other operations.

FileReader

Handles reading different file formats with a unified approach for unstructured documents.

For unstructured documents (TXT, DOCX, PDF), the default strategy is to convert everything to PDF and use instructor's multimodal PDF support. This eliminates the need for manual chunking and provides the best context preservation.

Source code in structx/utils/file_reader.py
class FileReader:
    """
    Handles reading different file formats with a unified approach for unstructured documents.

    Structured files (CSV, Excel, JSON, Parquet, Feather) are loaded straight
    into a DataFrame with the matching pandas reader.

    For unstructured documents (TXT, DOCX, PDF), the default strategy is to convert
    everything to PDF and use instructor's multimodal PDF support. This eliminates
    the need for manual chunking and provides the best context preservation.
    Two fallback modes ('simple_text' and 'simple_pdf') extract plain text and
    split it into overlapping character chunks instead.
    """

    # Lower-cased file suffix -> pandas reader. Each reader is called as
    # ``reader(path, **file_options)``.
    STRUCTURED_EXTENSIONS: Dict[str, Callable[..., pd.DataFrame]] = {
        ".csv": pd.read_csv,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
        ".json": pd.read_json,
        ".parquet": pd.read_parquet,
        ".feather": pd.read_feather,
    }

    # Suffixes read as UTF-8 text.
    TEXT_EXTENSIONS: List[str] = [".txt", ".md", ".py", ".html", ".xml", ".log", ".rst"]
    # Suffixes treated as binary documents.
    DOCUMENT_EXTENSIONS: List[str] = [".pdf", ".docx", ".doc"]

    @staticmethod
    def read_file(file_path: Union[str, Path], **kwargs: Any) -> pd.DataFrame:
        """
        Read a file and return its content based on the specified mode.

        For unstructured documents (TXT, DOCX, PDF), the default approach is to
        convert everything to PDF and use instructor's multimodal PDF support.
        This eliminates the need for manual chunking and provides the best
        context preservation.

        Args:
            file_path: Path to the file to read
            **kwargs: Additional options for file reading including:
                - mode: Reading mode - 'multimodal_pdf' (default), 'simple_text', or 'simple_pdf'
                - use_multimodal: Use instructor's multimodal support (default: True).
                  Passing False forces 'simple_text', whatever ``mode`` says.
                - file_options: Keyword options forwarded to the pandas reader
                  for structured files (None is treated as no options)
                - chunk_size: Characters per chunk in the simple modes (default: 1000)
                - chunk_overlap: Characters shared by consecutive chunks in the
                  simple modes (default: 200)

        Returns:
            pandas DataFrame with the appropriate structure for the specified mode:
            the parsed table for structured files; one row with the columns
            ``pdf_path``, ``source``, ``multimodal`` and ``file_type`` for
            'multimodal_pdf'; one row per chunk with the columns ``text``,
            ``chunk_id``, ``source`` and ``processing_method`` for the simple modes.

        Raises:
            FileError: If file cannot be read or processed
        """
        mode = kwargs.get("mode", "multimodal_pdf")
        use_multimodal = kwargs.get("use_multimodal", True)
        # ``or {}`` so that an explicit file_options=None does not break ``**``.
        file_options = kwargs.get("file_options") or {}

        # Legacy switch: turning multimodal off always means plain-text chunking.
        if not use_multimodal:
            mode = "simple_text"

        try:
            file_path = Path(file_path)
            if not file_path.exists():
                raise FileError(f"File not found: {file_path}")

            file_extension = file_path.suffix.lower()

            # Structured files: hand straight to the pandas reader.
            if file_extension in FileReader.STRUCTURED_EXTENSIONS:
                read_func = FileReader.STRUCTURED_EXTENSIONS[file_extension]
                return read_func(file_path, **file_options)

            if (
                file_extension not in FileReader.TEXT_EXTENSIONS
                and file_extension not in FileReader.DOCUMENT_EXTENSIONS
            ):
                raise FileError(f"Unsupported file type: {file_extension}")

            if mode == "multimodal_pdf":
                # Convert all unstructured documents to PDF for instructor's multimodal support
                pdf_path = FileReader._convert_to_pdf(file_path)
                return pd.DataFrame(
                    {
                        "pdf_path": [str(pdf_path)],
                        "source": [str(file_path)],
                        "multimodal": [True],
                        "file_type": ["pdf"],
                    }
                )

            if mode == "simple_text":
                return FileReader._read_as_text_chunks(file_path, kwargs)

            if mode == "simple_pdf":
                if file_extension == ".pdf":
                    return FileReader._read_pdf_chunks(file_path, kwargs)

                # Convert to PDF first, then extract its text. The generated
                # PDF is only an intermediate here, so remove it afterwards.
                generated_pdf = Path(FileReader._convert_to_pdf(file_path))
                try:
                    chunks_df = FileReader._read_pdf_chunks(generated_pdf, kwargs)
                finally:
                    if generated_pdf != file_path:
                        try:
                            generated_pdf.unlink()
                        except OSError:
                            # Best-effort cleanup; a leftover temp file must
                            # not mask the real result or error.
                            pass
                # Report the file the caller asked for, not the deleted temp PDF.
                chunks_df["source"] = str(file_path)
                return chunks_df

            # Supported extension, but the mode is not one we know.
            raise FileError(f"Unsupported mode: {mode}")

        except FileError:
            # Already a domain error with a precise message; do not re-wrap.
            raise
        except Exception as e:
            raise FileError(f"Error reading file {file_path}: {str(e)}") from e

    @staticmethod
    def _convert_to_pdf(file_path: Path) -> str:
        """
        Convert any supported document to PDF using docling -> markdown -> PDF pipeline.

        A file that is already a PDF is returned unchanged. Text files are read
        as UTF-8 and rendered directly; other document types are first
        converted to markdown with docling.

        Args:
            file_path: Path to the source document

        Returns:
            The path to the PDF file for use with instructor's multimodal
            support: the input path itself for a PDF, otherwise a newly
            created temporary file.

        Raises:
            FileError: If the type is unsupported, docling is missing, or the
                conversion fails
        """
        try:
            file_extension = file_path.suffix.lower()

            # If it's already a PDF, return as-is
            if file_extension == ".pdf":
                return str(file_path)

            # For simple text files, read directly
            if file_extension in FileReader.TEXT_EXTENSIONS:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                return FileReader._markdown_to_pdf(content, file_path.stem)

            # For document files, use docling to convert to markdown first
            if file_extension in FileReader.DOCUMENT_EXTENSIONS:
                # Guard only the import, so an ImportError raised later in the
                # pipeline is not misreported as "docling not available".
                try:
                    from docling.document_converter import DocumentConverter
                except ImportError as e:
                    raise FileError(
                        f"docling not available for {file_extension} conversion"
                    ) from e

                converter = DocumentConverter()
                result = converter.convert(str(file_path))
                markdown_content = result.document.export_to_markdown()
                return FileReader._markdown_to_pdf(markdown_content, file_path.stem)

            raise FileError(f"Unsupported file type for conversion: {file_extension}")

        except FileError:
            raise
        except Exception as e:
            raise FileError(f"Error converting {file_path} to PDF: {str(e)}") from e

    @staticmethod
    def _markdown_to_pdf(markdown_content: str, filename: str) -> str:
        """
        Convert markdown content to PDF and return the path.

        Args:
            markdown_content: Markdown text to render
            filename: Used as the prefix of the temporary file name

        Returns:
            Path of a temporary ``.pdf`` file. The file is not deleted
            automatically; the caller owns it.

        Raises:
            FileError: If the markdown or weasyprint package is not installed
        """
        try:
            import markdown
            import weasyprint
        except ImportError as e:
            raise FileError(
                "markdown and weasyprint are required for PDF conversion"
            ) from e

        # Convert markdown to HTML
        md = markdown.Markdown(extensions=["extra", "codehilite"])
        html_content = md.convert(markdown_content)

        # Add basic CSS styling
        html_with_css = f"""
            <!DOCTYPE html>
            <html>
            <head>
                <meta charset="utf-8">
                <style>
                    body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
                    h1, h2, h3 {{ color: #333; }}
                    pre {{ background-color: #f4f4f4; padding: 10px; border-radius: 5px; }}
                    code {{ background-color: #f4f4f4; padding: 2px 4px; border-radius: 3px; }}
                </style>
            </head>
            <body>
                {html_content}
            </body>
            </html>
            """

        # Reserve a unique path; delete=False because the path outlives this call.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", prefix=f"{filename}_"
        ) as tmp_file:
            pdf_path = tmp_file.name

        # Generate PDF with weasyprint
        try:
            weasyprint.HTML(string=html_with_css).write_pdf(pdf_path)
        except Exception:
            # Do not leave an empty temp file behind when rendering fails.
            try:
                Path(pdf_path).unlink()
            except OSError:
                pass
            raise
        return pdf_path

    @staticmethod
    def _chunk_text(content: str, kwargs: Dict[str, Any]) -> List[str]:
        """
        Split text into fixed-size character chunks with overlap.

        Args:
            content: Text to split
            kwargs: Options dict; reads ``chunk_size`` (default 1000) and
                ``chunk_overlap`` (default 200)

        Returns:
            List of chunks; empty when ``content`` is empty

        Raises:
            FileError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not in ``[0, chunk_size)``
        """
        chunk_size = kwargs.get("chunk_size", 1000)
        chunk_overlap = kwargs.get("chunk_overlap", 200)

        if chunk_size <= 0:
            raise FileError(f"chunk_size must be positive, got {chunk_size}")
        # An overlap >= chunk_size gives a zero or negative stride; a negative
        # overlap gives a stride larger than the chunk and silently skips text.
        if not 0 <= chunk_overlap < chunk_size:
            raise FileError(
                f"chunk_overlap must be >= 0 and smaller than chunk_size "
                f"({chunk_size}), got {chunk_overlap}"
            )

        stride = chunk_size - chunk_overlap
        return [
            content[start : start + chunk_size]
            for start in range(0, len(content), stride)
        ]

    @staticmethod
    def _chunks_to_frame(
        chunks: List[str], source: Path, processing_method: str
    ) -> pd.DataFrame:
        """Build the one-row-per-chunk DataFrame shared by the simple modes."""
        return pd.DataFrame(
            {
                "text": chunks,
                "chunk_id": range(len(chunks)),
                "source": str(source),
                "processing_method": [processing_method] * len(chunks),
            }
        )

    @staticmethod
    def _read_as_text_chunks(file_path: Path, kwargs: Dict[str, Any]) -> pd.DataFrame:
        """
        Simple text reading fallback with chunking.

        Text files are read as UTF-8, DOCX paragraphs are joined with
        newlines, and PDF text is extracted with PyPDF2.

        Args:
            file_path: Path to the file to read
            kwargs: Options dict; ``chunk_size`` and ``chunk_overlap`` are used

        Returns:
            DataFrame with the columns ``text``, ``chunk_id``, ``source`` and
            ``processing_method`` (always 'simple_text')

        Raises:
            FileError: If the type cannot be read as text, a required package
                is missing, or reading fails
        """
        try:
            file_extension = file_path.suffix.lower()

            if file_extension in FileReader.TEXT_EXTENSIONS:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
            elif file_extension == ".docx":
                try:
                    from docx import Document
                except ImportError as e:
                    raise FileError(
                        "python-docx not available for DOCX reading"
                    ) from e

                doc = Document(str(file_path))
                content = "\n".join(paragraph.text for paragraph in doc.paragraphs)
            elif file_extension == ".pdf":
                content = FileReader._extract_pdf_text(file_path)
            else:
                raise FileError(f"Cannot read {file_extension} as simple text")

            chunks = FileReader._chunk_text(content, kwargs)
            return FileReader._chunks_to_frame(chunks, file_path, "simple_text")

        except FileError:
            raise
        except Exception as e:
            raise FileError(f"Error reading {file_path} as text: {str(e)}") from e

    @staticmethod
    def _read_pdf_chunks(file_path: Path, kwargs: Dict[str, Any]) -> pd.DataFrame:
        """
        Simple PDF text extraction fallback with chunking.

        Args:
            file_path: Path to the PDF file
            kwargs: Options dict; ``chunk_size`` and ``chunk_overlap`` are used

        Returns:
            DataFrame with the columns ``text``, ``chunk_id``, ``source`` and
            ``processing_method`` (always 'simple_pdf')

        Raises:
            FileError: If extraction or chunking fails
        """
        try:
            content = FileReader._extract_pdf_text(file_path)
            chunks = FileReader._chunk_text(content, kwargs)
            return FileReader._chunks_to_frame(chunks, file_path, "simple_pdf")

        except FileError:
            raise
        except Exception as e:
            raise FileError(f"Error reading PDF {file_path}: {str(e)}") from e

    @staticmethod
    def _extract_pdf_text(file_path: Path) -> str:
        """
        Extract text from PDF file using PyPDF2.

        Args:
            file_path: Path to the PDF file

        Returns:
            The text of every page, each followed by a newline

        Raises:
            FileError: If PyPDF2 is not installed or the PDF cannot be read
        """
        try:
            import PyPDF2
        except ImportError as e:
            raise FileError("PyPDF2 not available for simple PDF reading") from e

        try:
            with open(file_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # ``or ""`` guards against a page that yields no text object.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )
        except Exception as e:
            raise FileError(f"Error reading PDF {file_path}: {str(e)}") from e

    @staticmethod
    def get_file_type(file_path: Union[str, Path]) -> str:
        """
        Get the type of file based on its extension.

        Args:
            file_path: Path whose (case-insensitive) suffix is inspected;
                the file does not need to exist

        Returns:
            'structured', 'text', 'document', or 'unknown'
        """
        suffix = Path(file_path).suffix.lower()

        if suffix in FileReader.STRUCTURED_EXTENSIONS:
            return "structured"
        if suffix in FileReader.TEXT_EXTENSIONS:
            return "text"
        if suffix in FileReader.DOCUMENT_EXTENSIONS:
            return "document"
        return "unknown"

get_file_type(file_path) staticmethod

Get the type of file based on its extension

Source code in structx/utils/file_reader.py
@staticmethod
def get_file_type(file_path: Union[str, Path]) -> str:
    """
    Get the type of file based on its extension.

    Args:
        file_path: Path whose (case-insensitive) suffix is inspected;
            the file does not need to exist

    Returns:
        'structured', 'text', 'document', or 'unknown'
    """
    suffix = Path(file_path).suffix.lower()

    if suffix in FileReader.STRUCTURED_EXTENSIONS:
        return "structured"
    if suffix in FileReader.TEXT_EXTENSIONS:
        return "text"
    if suffix in FileReader.DOCUMENT_EXTENSIONS:
        return "document"
    return "unknown"

read_file(file_path, **kwargs) staticmethod

Read a file and return its content based on the specified mode.

For unstructured documents (TXT, DOCX, PDF), the default approach is to convert everything to PDF and use instructor's multimodal PDF support. This eliminates the need for manual chunking and provides the best context preservation.

Parameters:

Name Type Description Default
file_path Union[str, Path]

Path to the file to read

required
**kwargs Any

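Additional options for file reading, including: `mode` — the reading mode, one of 'multimodal_pdf' (default), 'simple_text', or 'simple_pdf'; `use_multimodal` — use instructor's multimodal support (default: True; passing False forces 'simple_text'); `file_options` — additional options passed to the reader for structured files; `chunk_size` and `chunk_overlap` — chunk length and overlap, in characters, for the simple modes (defaults: 1000 and 200).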

{}

Returns:

Type Description
DataFrame

pandas DataFrame with the appropriate structure for the specified mode

Raises:

Type Description
FileError

If file cannot be read or processed

Source code in structx/utils/file_reader.py
@staticmethod
def read_file(file_path: Union[str, Path], **kwargs: Any) -> pd.DataFrame:
    """
    Read a file and return its content based on the specified mode.

    For unstructured documents (TXT, DOCX, PDF), the default approach is to
    convert everything to PDF and use instructor's multimodal PDF support.
    This eliminates the need for manual chunking and provides the best
    context preservation.

    Args:
        file_path: Path to the file to read
        **kwargs: Additional options for file reading including:
            - mode: Reading mode - 'multimodal_pdf' (default), 'simple_text', or 'simple_pdf'
            - use_multimodal: Use instructor's multimodal support (default: True).
              Passing False forces 'simple_text', whatever ``mode`` says.
            - file_options: Keyword options forwarded to the pandas reader
              for structured files (None is treated as no options)

    Returns:
        pandas DataFrame with the appropriate structure for the specified mode:
        the parsed table for structured files; one row with the columns
        ``pdf_path``, ``source``, ``multimodal`` and ``file_type`` for
        'multimodal_pdf'; the chunk DataFrame produced by the text/PDF
        chunk readers for the simple modes.

    Raises:
        FileError: If file cannot be read or processed
    """
    mode = kwargs.get("mode", "multimodal_pdf")
    use_multimodal = kwargs.get("use_multimodal", True)
    # ``or {}`` so that an explicit file_options=None does not break ``**``.
    file_options = kwargs.get("file_options") or {}

    # Legacy switch: turning multimodal off always means plain-text chunking.
    if not use_multimodal:
        mode = "simple_text"

    try:
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileError(f"File not found: {file_path}")

        file_extension = file_path.suffix.lower()

        # Structured files: hand straight to the pandas reader.
        if file_extension in FileReader.STRUCTURED_EXTENSIONS:
            read_func = FileReader.STRUCTURED_EXTENSIONS[file_extension]
            return read_func(file_path, **file_options)

        if (
            file_extension not in FileReader.TEXT_EXTENSIONS
            and file_extension not in FileReader.DOCUMENT_EXTENSIONS
        ):
            raise FileError(f"Unsupported file type: {file_extension}")

        if mode == "multimodal_pdf":
            # Convert all unstructured documents to PDF for instructor's multimodal support
            pdf_path = FileReader._convert_to_pdf(file_path)
            return pd.DataFrame(
                {
                    "pdf_path": [str(pdf_path)],
                    "source": [str(file_path)],
                    "multimodal": [True],
                    "file_type": ["pdf"],
                }
            )

        if mode == "simple_text":
            return FileReader._read_as_text_chunks(file_path, kwargs)

        if mode == "simple_pdf":
            if file_extension == ".pdf":
                return FileReader._read_pdf_chunks(file_path, kwargs)

            # Convert to PDF first, then extract its text. The generated
            # PDF is only an intermediate here, so remove it afterwards.
            generated_pdf = Path(FileReader._convert_to_pdf(file_path))
            try:
                chunks_df = FileReader._read_pdf_chunks(generated_pdf, kwargs)
            finally:
                if generated_pdf != file_path:
                    try:
                        generated_pdf.unlink()
                    except OSError:
                        # Best-effort cleanup; a leftover temp file must
                        # not mask the real result or error.
                        pass
            # Report the file the caller asked for, not the deleted temp PDF.
            chunks_df["source"] = str(file_path)
            return chunks_df

        # Supported extension, but the mode is not one we know.
        raise FileError(f"Unsupported mode: {mode}")

    except FileError:
        # Already a domain error with a precise message; do not re-wrap.
        raise
    except Exception as e:
        raise FileError(f"Error reading file {file_path}: {str(e)}") from e