""" parquet compat """
from __future__ import annotations

import io
import json
import os
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)
import warnings
from warnings import (
    catch_warnings,
    filterwarnings,
)

from pandas._config.config import _get_option

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend

from pandas import (
    DataFrame,
    get_option,
)
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_table_to_pandas
from pandas.io.common import (
    IOHandles,
    get_handle,
    is_fsspec_url,
    is_url,
    stringify_path,
)

if TYPE_CHECKING:
    from pandas._typing import (
        DtypeBackend,
        FilePath,
        ReadBuffer,
        StorageOptions,
        WriteBuffer,
    )


def get_engine(engine: str) -> BaseImpl:
    """return our implementation"""
    if engine == "auto":
        engine = get_option("io.parquet.engine")

    if engine == "auto":
        # try engines in this order
        engine_classes = [PyArrowImpl, FastParquetImpl]

        error_msgs = ""
        for engine_class in engine_classes:
            try:
                return engine_class()
            except ImportError as err:
                error_msgs += "\n - " + str(err)

        raise ImportError(
            "Unable to find a usable engine; "
            "tried using: 'pyarrow', 'fastparquet'.\n"
            "A suitable version of "
            "pyarrow or fastparquet is required for parquet "
            "support.\n"
            "Trying to import the above resulted in these errors:"
            f"{error_msgs}"
        )

    if engine == "pyarrow":
        return PyArrowImpl()
    elif engine == "fastparquet":
        return FastParquetImpl()

    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
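
# Illustrative usage sketch (not part of this module): the engine resolution
# implemented above can be steered from user code either per call or globally
# via the ``io.parquet.engine`` option. A minimal example, assuming pyarrow is
# installed:
#
#   import pandas as pd
#
#   pd.set_option("io.parquet.engine", "pyarrow")    # make "auto" pick pyarrow
#   df = pd.DataFrame({"a": [1, 2, 3]})
#   df.to_parquet("example.parquet", engine="auto")  # resolved by get_engine()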


def _get_path_or_handle(
    path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
    fs: Any,
    storage_options: StorageOptions | None = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[
    FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if fs is not None:
        pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
        fsspec = import_optional_dependency("fsspec", errors="ignore")
        if pa_fs is not None and isinstance(fs, pa_fs.FileSystem):
            if storage_options:
                raise NotImplementedError(
                    "storage_options not supported with a pyarrow FileSystem."
                )
        elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem):
            pass
        else:
            raise ValueError(
                f"filesystem must be a pyarrow or fsspec FileSystem, "
                f"not a {type(fs).__name__}"
            )
    if is_fsspec_url(path_or_handle) and fs is None:
        if storage_options is None:
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")

            try:
                fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass
        if fs is None:
            fsspec = import_optional_dependency("fsspec")
            fs, path_or_handle = fsspec.core.url_to_fs(
                path_or_handle, **(storage_options or {})
            )
    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError("storage_options passed with buffer, or non-supported URL")

    handles = None
    if (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    ):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path_or_handle, mode, is_text=False, storage_options=storage_options
        )
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs
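
# Illustrative sketch (not part of this module): the ``fs`` argument validated
# above may be either a ``pyarrow.fs.FileSystem`` or an fsspec filesystem, and
# reaches this helper via the public ``filesystem`` keyword. A hypothetical
# example, assuming pyarrow and a local ``data/`` directory:
#
#   import pyarrow.fs as pa_fs
#
#   fs = pa_fs.LocalFileSystem()
#   df.to_parquet("data/example.parquet", engine="pyarrow", filesystem=fs)
#   df2 = pd.read_parquet("data/example.parquet", filesystem=fs)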


class BaseImpl:
    @staticmethod
    def validate_dataframe(df: DataFrame) -> None:
        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

    def write(self, df: DataFrame, path, compression, **kwargs):
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs) -> DataFrame:
        raise AbstractMethodError(self)


class PyArrowImpl(BaseImpl):
    def __init__(self) -> None:
        import_optional_dependency(
            "pyarrow", extra="pyarrow is required for parquet support."
        )
        import pyarrow.parquet

        # import utils to register the pyarrow extension types
        import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401

        self.api = pyarrow

    def write(
        self,
        df: DataFrame,
        path: FilePath | WriteBuffer[bytes],
        compression: str | None = "snappy",
        index: bool | None = None,
        storage_options: StorageOptions | None = None,
        partition_cols: list[str] | None = None,
        filesystem=None,
        **kwargs,
    ) -> None:
        self.validate_dataframe(df)

        from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
        if index is not None:
            from_pandas_kwargs["preserve_index"] = index

        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

        if df.attrs:
            df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
            existing_metadata = table.schema.metadata
            merged_metadata = {**existing_metadata, **df_metadata}
            table = table.replace_schema_metadata(merged_metadata)

        path_or_handle, handles, filesystem = _get_path_or_handle(
            path,
            filesystem,
            storage_options=storage_options,
            mode="wb",
            is_dir=partition_cols is not None,
        )
        if (
            isinstance(path_or_handle, io.BufferedWriter)
            and hasattr(path_or_handle, "name")
            and isinstance(path_or_handle.name, (str, bytes))
        ):
            if isinstance(path_or_handle.name, bytes):
                path_or_handle = path_or_handle.name.decode()
            else:
                path_or_handle = path_or_handle.name

        try:
            if partition_cols is not None:
                # writes to multiple files under the given path
                self.api.parquet.write_to_dataset(
                    table,
                    path_or_handle,
                    compression=compression,
                    partition_cols=partition_cols,
                    filesystem=filesystem,
                    **kwargs,
                )
            else:
                # write to single output file
                self.api.parquet.write_table(
                    table,
                    path_or_handle,
                    compression=compression,
                    filesystem=filesystem,
                    **kwargs,
                )
        finally:
            if handles is not None:
                handles.close()

    def read(
        self,
        path,
        columns=None,
        filters=None,
        use_nullable_dtypes: bool = False,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}

        manager = _get_option("mode.data_manager", silent=True)
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True
        path_or_handle, handles, filesystem = _get_path_or_handle(
            path,
            filesystem,
            storage_options=storage_options,
            mode="rb",
        )
        try:
            pa_table = self.api.parquet.read_table(
                path_or_handle,
                columns=columns,
                filesystem=filesystem,
                filters=filters,
                **kwargs,
            )

            with catch_warnings():
                filterwarnings(
                    "ignore",
                    "make_block is deprecated",
                    DeprecationWarning,
                )
                result = arrow_table_to_pandas(
                    pa_table,
                    dtype_backend=dtype_backend,
                    to_pandas_kwargs=to_pandas_kwargs,
                )

            if manager == "array":
                result = result._as_manager("array", copy=False)

            if pa_table.schema.metadata:
                if b"PANDAS_ATTRS" in pa_table.schema.metadata:
                    df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
                    result.attrs = json.loads(df_metadata)
            return result
        finally:
            if handles is not None:
                handles.close()
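
# Illustrative sketch (not part of this module): with the pyarrow engine,
# ``DataFrame.attrs`` is serialized into the Arrow schema metadata under the
# ``PANDAS_ATTRS`` key on write and restored on read, e.g.:
#
#   df = pd.DataFrame({"a": [1, 2]})
#   df.attrs = {"source": "sensor-7"}        # arbitrary JSON-serializable attrs
#   buf = df.to_parquet(engine="pyarrow")    # bytes, since no path is given
#   pd.read_parquet(io.BytesIO(buf)).attrs   # -> {"source": "sensor-7"}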


class FastParquetImpl(BaseImpl):
    def __init__(self) -> None:
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        fastparquet = import_optional_dependency(
            "fastparquet", extra="fastparquet is required for parquet support."
        )
        self.api = fastparquet

    def write(
        self,
        df: DataFrame,
        path,
        compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
        index=None,
        partition_cols=None,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        **kwargs,
    ) -> None:
        self.validate_dataframe(df)

        if "partition_on" in kwargs and partition_cols is not None:
            raise ValueError(
                "Cannot use both partition_on and "
                "partition_cols. Use partition_cols for partitioning data"
            )
        if "partition_on" in kwargs:
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if filesystem is not None:
            raise NotImplementedError(
                "filesystem is not implemented for the fastparquet engine."
            )

        # cannot use get_handle as write() does not accept file buffers
        path = stringify_path(path)
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
            kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "wb", **(storage_options or {})
            ).open()
        elif storage_options:
            raise ValueError(
                "storage_options passed with file object or non-fsspec file path"
            )

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )

    def read(
        self,
        path,
        columns=None,
        filters=None,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        **kwargs,
    ) -> DataFrame:
        parquet_kwargs: dict[str, Any] = {}
        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
        dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
        # We are disabling nullable dtypes for fastparquet pending discussion
        parquet_kwargs["pandas_nulls"] = False
        if use_nullable_dtypes:
            raise ValueError(
                "The 'use_nullable_dtypes' argument is not supported for the "
                "fastparquet engine"
            )
        if dtype_backend is not lib.no_default:
            raise ValueError(
                "The 'dtype_backend' argument is not supported for the "
                "fastparquet engine"
            )
        if filesystem is not None:
            raise NotImplementedError(
                "filesystem is not implemented for the fastparquet engine."
            )
        path = stringify_path(path)
        handles = None
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            parquet_kwargs["fs"] = fsspec.open(path, "rb", **(storage_options or {})).fs
        elif isinstance(path, str) and not os.path.isdir(path):
            # use get_handle only when we are very certain that it is not a directory
            # fsspec resources can also point to directories
            # this branch is used for example when reading from non-fsspec URLs
            handles = get_handle(
                path, "rb", is_text=False, storage_options=storage_options
            )
            path = handles.handle

        try:
            parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
            return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs)
        finally:
            if handles is not None:
                handles.close()
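
# Illustrative sketch (not part of this module): with the fastparquet engine,
# ``partition_cols`` (or the engine-native ``partition_on``) switches the write
# above to a hive-style partitioned dataset under a directory, e.g.:
#
#   df = pd.DataFrame({"year": [2023, 2024], "value": [1.0, 2.0]})
#   df.to_parquet("dataset_dir", engine="fastparquet", partition_cols=["year"])
#   pd.read_parquet("dataset_dir", engine="fastparquet")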


@doc(storage_options=_shared_docs["storage_options"])
def to_parquet(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    engine: str = "auto",
    compression: str | None = "snappy",
    index: bool | None = None,
    storage_options: StorageOptions | None = None,
    partition_cols: list[str] | None = None,
    filesystem: Any = None,
    **kwargs,
) -> bytes | None:
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : str, path object, file-like object, or None, default None
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``write()`` function. If None, the result is
        returned as bytes. If a string, it will be used as Root Directory path
        when writing a partitioned dataset. The engine fastparquet does not
        accept file-like objects.
    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.

        When using the ``'pyarrow'`` engine and no storage options are provided
        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
        Use the filesystem keyword with an instantiated fsspec filesystem
        if you wish to use its implementation.
    compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
        default 'snappy'. Name of the compression to use. Use ``None``
        for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``True`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    partition_cols : str or list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
        Must be None if path is not a string.
    {storage_options}

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when writing the parquet file. Only implemented
        for ``engine="pyarrow"``.

        .. versionadded:: 2.1.0

    kwargs
        Additional keyword arguments passed to the engine.

    Returns
    -------
    bytes if no path argument is provided else None
    """
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]
    impl = get_engine(engine)

    path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path

    impl.write(
        df,
        path_or_buf,
        compression=compression,
        index=index,
        partition_cols=partition_cols,
        storage_options=storage_options,
        filesystem=filesystem,
        **kwargs,
    )

    if path is None:
        assert isinstance(path_or_buf, io.BytesIO)
        return path_or_buf.getvalue()
    else:
        return None
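
# Illustrative sketch (not part of this module): when ``path`` is None,
# ``to_parquet`` writes into an in-memory buffer and returns the bytes;
# otherwise it writes to the given path and returns None, e.g.:
#
#   payload = df.to_parquet(compression="gzip")   # bytes
#   df.to_parquet("out.parquet")                  # writes a file, returns None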


@doc(storage_options=_shared_docs["storage_options"])
def read_parquet(
    path: FilePath | ReadBuffer[bytes],
    engine: str = "auto",
    columns: list[str] | None = None,
    storage_options: StorageOptions | None = None,
    use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: Any = None,
    filters: list[tuple] | list[list[tuple]] | None = None,
    **kwargs,
) -> DataFrame:
    """
    Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.

        When using the ``'pyarrow'`` engine and no storage options are provided
        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
        Use the filesystem keyword with an instantiated fsspec filesystem
        if you wish to use its implementation.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    {storage_options}

        .. versionadded:: 1.3.0

    use_nullable_dtypes : bool, default False
        If True, use dtypes that use ``pd.NA`` as missing value indicator
        for the resulting DataFrame. (only applicable for the ``pyarrow``
        engine)
        As new dtypes are added that support ``pd.NA`` in the future, the
        output with this option will change to use those dtypes.
        Note: this is an experimental option, and behaviour (e.g. additional
        supported dtypes) may change without notice.

        .. deprecated:: 2.0

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the parquet file. Only implemented
        for ``engine="pyarrow"``.

        .. versionadded:: 2.1.0

    filters : List[Tuple] or List[List[Tuple]], default None
        To filter out data.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, >, >=, <, <=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an `AND` operation.
        The outer list combines these sets of filters through an `OR`
        operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between set of filters is to be conducted.

        Using this argument will NOT result in row-wise filtering of the final
        partitions unless ``engine="pyarrow"`` is also specified. For
        other engines, filtering is only performed at the partition level, that is,
        to prevent the loading of some row-groups and/or files.

        .. versionadded:: 2.1.0

    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_parquet : Create a parquet object that serializes a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame(
    ...     {{"foo": range(5), "bar": range(5, 10)}}
    ... )
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> df_parquet_bytes = original_df.to_parquet()
    >>> from io import BytesIO
    >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
    >>> restored_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> restored_df.equals(original_df)
    True
    >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"])
    >>> restored_bar
        bar
    0    5
    1    6
    2    7
    3    8
    4    9
    >>> restored_bar.equals(original_df[['bar']])
    True

    The function uses `kwargs` that are passed directly to the engine.
    In the following example, we use the `filters` argument of the pyarrow
    engine to filter the rows of the DataFrame.

    Since `pyarrow` is the default engine, we can omit the `engine` argument.
    Note that the `filters` argument is implemented by the `pyarrow` engine,
    which can benefit from multithreading and also potentially be more
    economical in terms of memory.

    >>> sel = [("foo", ">", 2)]
    >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel)
    >>> restored_part
        foo  bar
    0    3    8
    1    4    9
    """

    impl = get_engine(engine)

    if use_nullable_dtypes is not lib.no_default:
        msg = (
            "The argument 'use_nullable_dtypes' is deprecated and will be removed "
            "in a future version."
        )
        if use_nullable_dtypes is True:
            msg += (
                " Use dtype_backend='numpy_nullable' instead of "
                "use_nullable_dtypes=True."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
    else:
        use_nullable_dtypes = False
    check_dtype_backend(dtype_backend)

    return impl.read(
        path,
        columns=columns,
        filters=filters,
        storage_options=storage_options,
        use_nullable_dtypes=use_nullable_dtypes,
        dtype_backend=dtype_backend,
        filesystem=filesystem,
        **kwargs,
    )
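
# Illustrative sketch (not part of this module): ``dtype_backend`` controls the
# dtypes of the frame returned by ``read_parquet`` (pyarrow engine only; the
# fastparquet path above rejects it), e.g.:
#
#   pd.read_parquet("out.parquet", dtype_backend="numpy_nullable")  # Int64, Float64, ...
#   pd.read_parquet("out.parquet", dtype_backend="pyarrow")         # ArrowDtype-backed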