py-data-analysis/.venv/lib/python3.12/site-packages/nbconvert/preprocessors/extractattachments.py

"""
Module that extracts attachments from notebooks into their own files
"""

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import os
from base64 import b64decode

from traitlets import Bool, Unicode

from .base import Preprocessor


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from all (markdown and raw) cells in a notebook.

    By default the extracted attachments share the location used for
    extracted outputs: they are keyed under ``resources["outputs"]`` with
    paths rooted at ``resources["output_files_dir"]``.  When
    ``use_separate_dir`` is True they are instead keyed under
    ``resources["attachments"]`` with paths rooted at the directory named by
    ``attachments_directory_template``.
    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor
        """
        super().__init__(**kw)
        # Directory path that extracted attachment filenames are prefixed with.
        # Set in self.preprocess because it depends on resources.
        self.path_name = ""
        # Key in resources under which extracted attachments are stored.
        # Given a default here in case someone doesn't want to call preprocess.
        self.resources_item_key = "attachments"

    # Add condition and configurability here
    def preprocess(self, nb, resources):
        """
        Determine some settings and apply preprocessor to notebook

        Sets ``self.path_name`` and ``self.resources_item_key`` from
        ``use_separate_dir``, makes sure the target entry in ``resources`` is
        a dict, then delegates to the base class ``preprocess``.

        Returns the ``(nb, resources)`` pair produced by the base class.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources["output_files_dir"]
            self.resources_item_key = "outputs"

        # Make sure key exists. ``.get`` is used so that a plain dict without
        # the key is initialised here instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        nb, resources = super().preprocess(nb, resources)
        return nb, resources

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.
        E.g.
        '![image.png](attachment:021fdd80.png)'
        becomes
        '![image.png]({path_name}/021fdd80.png)'
        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).

        An attachment whose mime bundle is empty has no data to extract; it is
        skipped with a warning and its reference in the cell source is left
        untouched.

        Returns the ``(cell, resources)`` pair, both modified in place.
        """
        if "attachments" not in cell:
            return cell, resources

        for fname in cell.attachments:
            self.log.debug("Encountered attachment %s", fname)

            # Right now I don't know of a situation where there would be multiple
            # mime types under same filename, so only the first one is read.
            mime_bundle = cell.attachments[fname]
            first_mimetype = next(iter(mime_bundle), None)
            if first_mimetype is None:
                # Without this guard the decoded payload would be undefined
                # (or stale data left over from the previous attachment).
                self.log.warning("Attachment %s contains no data, skipping", fname)
                continue

            # convert to bytes and decode
            payload = mime_bundle[first_mimetype].encode("utf-8")
            decoded = b64decode(payload)

            # Add file for writer.
            # FilesWriter wants path to be in attachment filename here
            # NOTE(review): fname comes from the notebook file and is joined
            # unchecked; an absolute or '..' name escapes path_name — confirm
            # whether the writer guards against that.
            new_filename = os.path.join(self.path_name, fname)
            resources[self.resources_item_key][new_filename] = decoded

            # Edit the reference to the attachment

            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources