Source code for pandas.core.frame

"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.

Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
from __future__ import annotations

import collections
from collections import abc
import datetime
from io import StringIO
import itertools
import mmap
from textwrap import dedent
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    AnyStr,
    Dict,
    FrozenSet,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
    overload,
)
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

from pandas._libs import algos as libalgos, lib, properties
from pandas._libs.lib import no_default
from pandas._typing import (
    AggFuncType,
    ArrayLike,
    Axes,
    Axis,
    CompressionOptions,
    Dtype,
    FilePathOrBuffer,
    FrameOrSeriesUnion,
    IndexKeyFunc,
    Label,
    Level,
    Renamer,
    StorageOptions,
    ValueKeyFunc,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender,
    Substitution,
    deprecate_kwarg,
    doc,
    rewrite_axis_style_signature,
)
from pandas.util._validators import (
    validate_axis_style_args,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    construct_1d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_datetimelike,
    maybe_cast_to_datetime,
    maybe_casted_values,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    maybe_infer_to_datetimelike,
    maybe_upcast,
    validate_numeric_casting,
)
from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_platform_int,
    infer_dtype_from_object,
    is_bool_dtype,
    is_dataclass,
    is_datetime64_any_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
    is_scalar,
    is_sequence,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms, common as com, generic, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.aggregation import (
    aggregate,
    reconstruct_func,
    relabel_result,
    transform,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    get_names_from_index,
    init_dict,
    init_ndarray,
    masked_rec_array_to_mgr,
    reorder_arrays,
    sanitize_index,
    to_arrays,
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort

from pandas.io.common import get_handle
from pandas.io.formats import console, format as fmt
from pandas.io.formats.info import BaseInfo, DataFrameInfo
import pandas.plotting

if TYPE_CHECKING:
    from typing import Literal

    from pandas.core.groupby.generic import DataFrameGroupBy

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels.
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels.""",
    "optional_labels": """labels : array-like, optional
            New labels / index to conform the axis specified by 'axis' to.""",
    "optional_axis": """axis : int or str, optional
            Axis to target. Can be either the axis name ('index', 'columns')
            or number (0, 1).""",
}

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
    lkey value
0   foo      1
1   bar      2
2   baz      3
3   foo      5
>>> df2
    rkey value
0   foo      5
1   bar      6
2   baz      7
3   foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
      a  b
0   foo  1
1   bar  2
>>> df2
      a  c
0   foo  3
1   baz  4

>>> df1.merge(df2, how='inner', on='a')
      a  b  c
0   foo  1  3

>>> df1.merge(df2, how='left', on='a')
      a  b  c
0   foo  1  3.0
1   bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
    left
0   foo
1   bar
>>> df2
    right
0   7
1   8

>>> df1.merge(df2, how='cross')
   left  right
0   foo      7
1   foo      8
2   bar      7
3   bar      8
"""


# -----------------------------------------------------------------------
# DataFrame class


[docs]class DataFrame(NDFrame, OpsMixin): """ Two-dimensional, size-mutable, potentially heterogeneous tabular data. Data structure also contains labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure. Parameters ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame Dict can contain Series, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order. .. versionchanged:: 0.25.0 If data is a list of dicts, column order follows insertion-order. index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided. columns : Index or array-like Column labels to use for resulting frame. Will default to RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. See Also -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. read_csv : Read a comma-separated values (csv) file into DataFrame. read_table : Read general delimited file into DataFrame. read_clipboard : Read text from clipboard into DataFrame. Examples -------- Constructing DataFrame from a dictionary. >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> df = pd.DataFrame(data=d) >>> df col1 col2 0 1 3 1 2 4 Notice that the inferred dtype is int64. >>> df.dtypes col1 int64 col2 int64 dtype: object To enforce a single dtype: >>> df = pd.DataFrame(data=d, dtype=np.int8) >>> df.dtypes col1 int8 col2 int8 dtype: object Constructing DataFrame from numpy ndarray: >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), ... columns=['a', 'b', 'c']) >>> df2 a b c 0 1 2 3 1 4 5 6 2 7 8 9 Constructing DataFrame from dataclass: >>> from dataclasses import make_dataclass >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) x y 0 0 0 1 0 3 2 2 3 """ _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) _accessors: Set[str] = {"sparse"} @property def _constructor_expanddim(self): # GH#31549 raising NotImplementedError on a property causes trouble # for `inspect` def constructor(*args, **kwargs): raise NotImplementedError("Not supported for DataFrames!") return constructor # ---------------------------------------------------------------------- # Constructors def __init__( self, data=None, index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, ): if data is None: data = {} if dtype is not None: dtype = self._validate_dtype(dtype) if isinstance(data, DataFrame): data = data._mgr if isinstance(data, BlockManager): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) return mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) # a masked array else: mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: data_columns = list(data.dtype.names) data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns mgr = init_dict(data, index, columns, dtype=dtype) elif getattr(data, "name", None) is not None: mgr = init_dict({data.name: data}, index, columns, dtype=dtype) else: mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: if is_dataclass(data[0]): data = dataclasses_to_dicts(data) if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) # set the index if index is None: if isinstance(data[0], Series): index = get_names_from_index(data) elif isinstance(data[0], Categorical): index = ibase.default_index(len(data[0])) else: index = ibase.default_index(len(data)) mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: mgr = init_dict({}, index, columns, dtype=dtype) # For data is scalar else: if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) # For data is a scalar extension dtype if is_extension_array_dtype(dtype): values = [ construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) else: # Attempt to coerce to a numpy array try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: exc = TypeError( "DataFrame constructor called with " f"incompatible data and dtype: {err}" ) raise exc from err if arr.ndim != 0: raise ValueError("DataFrame constructor not properly called!") shape = (len(index), len(columns)) values = np.full(shape, arr) mgr = init_ndarray( values, index, columns, dtype=values.dtype, copy=False ) NDFrame.__init__(self, mgr) # ---------------------------------------------------------------------- @property def axes(self) -> List[Index]: """ Return a list representing the axes of the DataFrame. It has the row axis labels and column axis labels as the only members. They are returned in that order. Examples -------- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')] """ return [self.index, self.columns] @property def shape(self) -> Tuple[int, int]: """ Return a tuple representing the dimensionality of the DataFrame. See Also -------- ndarray.shape : Tuple of array dimensions. Examples -------- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.shape (2, 2) >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], ... 'col3': [5, 6]}) >>> df.shape (2, 3) """ return len(self.index), len(self.columns) @property def _is_homogeneous_type(self) -> bool: """ Whether all the columns in a DataFrame have the same type. Returns ------- bool See Also -------- Index._is_homogeneous_type : Whether the object has a single dtype. MultiIndex._is_homogeneous_type : Whether all the levels of a MultiIndex have the same dtype. Examples -------- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type True >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type False Items with the same type but different sizes are considered different types. >>> DataFrame({ ... "A": np.array([1, 2], dtype=np.int32), ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: return not self._is_mixed_type @property def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. """ if self._mgr.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False return len(self._mgr.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods def _repr_fits_vertical_(self) -> bool: """ Check length against max_rows. """ max_rows = get_option("display.max_rows") return len(self) <= max_rows def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: """ Check if full repr fits in horizontal boundaries imposed by the display options width and max_columns. In case of non-interactive session, no boundaries apply. `ignore_width` is here so ipynb+HTML output can behave the way users expect. display.max_columns remains in effect. GH3541, GH3573 """ width, height = console.get_console_size() max_columns = get_option("display.max_columns") nb_columns = len(self.columns) # exceed max columns if (max_columns and nb_columns > max_columns) or ( (not ignore_width) and width and nb_columns > (width // 2) ): return False # used by repr_html under IPython notebook or scripts ignore terminal # dims if ignore_width or not console.in_interactive_session(): return True if get_option("display.width") is not None or console.in_ipython_frontend(): # check at least the column row for excessive width max_rows = 1 else: max_rows = get_option("display.max_rows") # when auto-detecting, so width=None and not in ipython front end # check whether repr fits horizontal by actually checking # the width of the rendered repr buf = StringIO() # only care about the stuff we'll actually print out # and to_string on entire frame may be expensive d = self if not (max_rows is None): # unlimited rows # min of two, where one may be None d = d.iloc[: min(max_rows, len(d))] else: return True d.to_string(buf=buf) value = buf.getvalue() repr_width = max(len(line) for line in value.split("\n")) return repr_width < width def _info_repr(self) -> bool: """ True if the repr should show the info view. """ info_repr_option = get_option("display.large_repr") == "info" return info_repr_option and not ( self._repr_fits_horizontal_() and self._repr_fits_vertical_() ) def __repr__(self) -> str: """ Return a string representation for a particular DataFrame. """ buf = StringIO("") if self._info_repr(): self.info(buf=buf) return buf.getvalue() max_rows = get_option("display.max_rows") min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") max_colwidth = get_option("display.max_colwidth") show_dimensions = get_option("display.show_dimensions") if get_option("display.expand_frame_repr"): width, _ = console.get_console_size() else: width = None self.to_string( buf=buf, max_rows=max_rows, min_rows=min_rows, max_cols=max_cols, line_width=width, max_colwidth=max_colwidth, show_dimensions=show_dimensions, ) return buf.getvalue() def _repr_html_(self) -> Optional[str]: """ Return a html representation for a particular DataFrame. Mainly for IPython notebook. """ if self._info_repr(): buf = StringIO("") self.info(buf=buf) # need to escape the <class>, should be the first line. val = buf.getvalue().replace("<", r"&lt;", 1) val = val.replace(">", r"&gt;", 1) return "<pre>" + val + "</pre>" if get_option("display.notebook_repr_html"): max_rows = get_option("display.max_rows") min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") formatter = fmt.DataFrameFormatter( self, columns=None, col_space=None, na_rep="NaN", formatters=None, float_format=None, sparsify=None, justify=None, index_names=True, header=True, index=True, bold_rows=True, escape=True, max_rows=max_rows, min_rows=min_rows, max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", ) return fmt.DataFrameRenderer(formatter).to_html(notebook=True) else: return None
[docs] @Substitution( header_type="bool or sequence", header="Write out the column names. If a list of strings " "is given, it is assumed to be aliases for the " "column names", col_space_type="int, list or dict of int", col_space="The minimum width of each column", ) @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_string( self, buf: Optional[FilePathOrBuffer[str]] = None, columns: Optional[Sequence[str]] = None, col_space: Optional[int] = None, header: Union[bool, Sequence[str]] = True, index: bool = True, na_rep: str = "NaN", formatters: Optional[fmt.FormattersType] = None, float_format: Optional[fmt.FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, justify: Optional[str] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, show_dimensions: bool = False, decimal: str = ".", line_width: Optional[int] = None, max_colwidth: Optional[int] = None, encoding: Optional[str] = None, ) -> Optional[str]: """ Render a DataFrame to a console-friendly tabular output. %(shared_params)s line_width : int, optional Width to wrap a line in characters. max_colwidth : int, optional Max width to truncate each column in characters. By default, no limit. .. versionadded:: 1.0.0 encoding : str, default "utf-8" Set character encoding. .. versionadded:: 1.0 %(returns)s See Also -------- to_html : Convert DataFrame to HTML. Examples -------- >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} >>> df = pd.DataFrame(d) >>> print(df.to_string()) col1 col2 0 1 4 1 2 5 2 3 6 """ from pandas import option_context with option_context("display.max_colwidth", max_colwidth): formatter = fmt.DataFrameFormatter( self, columns=columns, col_space=col_space, na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, justify=justify, index_names=index_names, header=header, index=index, min_rows=min_rows, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, ) return fmt.DataFrameRenderer(formatter).to_string( buf=buf, encoding=encoding, line_width=line_width, )
# ---------------------------------------------------------------------- @property def style(self) -> Styler: """ Returns a Styler object. Contains methods for building a styled HTML representation of the DataFrame. See Also -------- io.formats.style.Styler : Helps style a DataFrame or Series according to the data with HTML and CSS. """ from pandas.io.formats.style import Styler return Styler(self) _shared_docs[ "items" ] = r""" Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. Yields ------ label : object The column names for the DataFrame being iterated over. content : Series The column entries belonging to each label, as a Series. See Also -------- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. Examples -------- >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... index=['panda', 'polar', 'koala']) >>> df species population panda bear 1864 polar bear 22000 koala marsupial 80000 >>> for label, content in df.items(): ... print(f'label: {label}') ... print(f'content: {content}', sep='\n') ... label: species content: panda bear polar bear koala marsupial Name: species, dtype: object label: population content: panda 1864 polar 22000 koala 80000 Name: population, dtype: int64 """
[docs] @Appender(_shared_docs["items"]) def items(self) -> Iterable[Tuple[Label, Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) else: for i, k in enumerate(self.columns): yield k, self._ixs(i, axis=1)
[docs] @Appender(_shared_docs["items"]) def iteritems(self) -> Iterable[Tuple[Label, Series]]: yield from self.items()
[docs] def iterrows(self) -> Iterable[Tuple[Label, Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. Yields ------ index : label or tuple of label The index of the row. A tuple for a `MultiIndex`. data : Series The data of the row as a Series. See Also -------- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- 1. Because ``iterrows`` returns a Series for each row, it does **not** preserve dtypes across the rows (dtypes are preserved across columns for DataFrames). For example, >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) >>> row = next(df.iterrows())[1] >>> row int 1.0 float 1.5 Name: 0, dtype: float64 >>> print(row['int'].dtype) float64 >>> print(df['int'].dtype) int64 To preserve dtypes while iterating over the rows, it is better to use :meth:`itertuples` which returns namedtuples of the values and which is generally faster than ``iterrows``. 2. You should **never modify** something you are iterating over. This is not guaranteed to work in all cases. Depending on the data types, the iterator returns a copy and not a view, and writing to it will have no effect. """ columns = self.columns klass = self._constructor_sliced for k, v in zip(self.index, self.values): s = klass(v, index=columns, name=k) yield k, s
[docs] def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"): """ Iterate over DataFrame rows as namedtuples. Parameters ---------- index : bool, default True If True, return the index as the first element of the tuple. name : str or None, default "Pandas" The name of the returned namedtuples or None to return regular tuples. Returns ------- iterator An object to iterate over namedtuples for each row in the DataFrame with the first field possibly being the index and following fields being the column values. See Also -------- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. On python versions < 3.7 regular tuples are returned for DataFrames with a large number of columns (>254). Examples -------- >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, ... index=['dog', 'hawk']) >>> df num_legs num_wings dog 4 0 hawk 2 2 >>> for row in df.itertuples(): ... print(row) ... Pandas(Index='dog', num_legs=4, num_wings=0) Pandas(Index='hawk', num_legs=2, num_wings=2) By setting the `index` parameter to False we can remove the index as the first element of the tuple: >>> for row in df.itertuples(index=False): ... print(row) ... Pandas(num_legs=4, num_wings=0) Pandas(num_legs=2, num_wings=2) With the `name` parameter set we set a custom name for the yielded namedtuples: >>> for row in df.itertuples(name='Animal'): ... print(row) ... Animal(Index='dog', num_legs=4, num_wings=0) Animal(Index='hawk', num_legs=2, num_wings=2) """ arrays = [] fields = list(self.columns) if index: arrays.append(self.index) fields.insert(0, "Index") # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) if name is not None: # https://github.com/python/mypy/issues/9046 # error: namedtuple() expects a string literal as the first argument itertuple = collections.namedtuple( # type: ignore[misc] name, fields, rename=True ) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples return zip(*arrays)
def __len__(self) -> int: """ Returns length of info axis, but here we use the index. """ return len(self.index)
[docs] def dot(self, other): """ Compute the matrix multiplication between the DataFrame and other. This method computes the matrix product between the DataFrame and the values of an other Series, DataFrame or a numpy array. It can also be called using ``self @ other`` in Python >= 3.5. Parameters ---------- other : Series, DataFrame or array-like The other object to compute the matrix product with. Returns ------- Series or DataFrame If other is a Series, return the matrix product between self and other as a Series. If other is a DataFrame or a numpy.array, return the matrix product of self and other in a DataFrame of a np.array. See Also -------- Series.dot: Similar method for Series. Notes ----- The dimensions of DataFrame and other must be compatible in order to compute the matrix multiplication. In addition, the column names of DataFrame and the index of other must contain the same values, as they will be aligned prior to the multiplication. The dot method for Series computes the inner product, instead of the matrix product here. Examples -------- Here we multiply a DataFrame with a Series. >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> s = pd.Series([1, 1, 2, 1]) >>> df.dot(s) 0 -4 1 5 dtype: int64 Here we multiply a DataFrame with another DataFrame. >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) >>> df.dot(other) 0 1 0 1 4 1 2 2 Note that the dot method give the same result as @ >>> df @ other 0 1 0 1 4 1 2 2 The dot method works also if other is an np.array. >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) >>> df.dot(arr) 0 1 0 1 4 1 2 2 Note how shuffling of the objects does not change the result. >>> s2 = s.reindex([1, 0, 2, 3]) >>> df.dot(s2) 0 -4 1 5 dtype: int64 """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) if len(common) > len(self.columns) or len(common) > len(other.index): raise ValueError("matrices are not aligned") left = self.reindex(columns=common, copy=False) right = other.reindex(index=common, copy=False) lvals = left.values rvals = right._values else: left = self lvals = self.values rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: raise ValueError( f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" ) if isinstance(other, DataFrame): return self._constructor( np.dot(lvals, rvals), index=left.index, columns=other.columns ) elif isinstance(other, Series): return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) elif isinstance(rvals, (np.ndarray, Index)): result = np.dot(lvals, rvals) if result.ndim == 2: return self._constructor(result, index=left.index) else: return self._constructor_sliced(result, index=left.index) else: # pragma: no cover raise TypeError(f"unsupported type: {type(other)}")
def __matmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ return self.dot(other) def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ try: return self.T.dot(np.transpose(other)).T except ValueError as err: if "shape mismatch" not in str(err): raise # GH#21581 give exception message for original shapes msg = f"shapes {np.shape(other)} and {self.shape} not aligned" raise ValueError(msg) from err # ---------------------------------------------------------------------- # IO methods (to / from other formats)
[docs] @classmethod def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. Creates DataFrame object from dictionary by columns or by index allowing dtype specification. Parameters ---------- data : dict Of the form {field : array-like} or {field : dict}. orient : {'columns', 'index'}, default 'columns' The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. dtype : dtype, default None Data type to force, otherwise infer. columns : list, default None Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'``. Returns ------- DataFrame See Also -------- DataFrame.from_records : DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame. DataFrame : DataFrame object creation using constructor. Examples -------- By default the keys of the dict become the DataFrame columns: >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} >>> pd.DataFrame.from_dict(data) col_1 col_2 0 3 a 1 2 b 2 1 c 3 0 d Specify ``orient='index'`` to create the DataFrame using dictionary keys as rows: >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} >>> pd.DataFrame.from_dict(data, orient='index') 0 1 2 3 row_1 3 2 1 0 row_2 a b c d When using the 'index' orientation, the column names can be specified manually: >>> pd.DataFrame.from_dict(data, orient='index', ... columns=['A', 'B', 'C', 'D']) A B C D row_1 3 2 1 0 row_2 a b c d """ index = None orient = orient.lower() if orient == "index": if len(data) > 0: # TODO speed up Series case if isinstance(list(data.values())[0], (Series, dict)): data = _from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) elif orient == "columns": if columns is not None: raise ValueError("cannot use columns parameter with orient='columns'") else: # pragma: no cover raise ValueError("only recognize index or columns for orient") return cls(data, index=index, columns=columns, dtype=dtype)
[docs] def to_numpy( self, dtype=None, copy: bool = False, na_value=lib.no_default ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. .. versionadded:: 0.24.0 By default, the dtype of the returned array will be the common NumPy dtype of all types in the DataFrame. For example, if the dtypes are ``float16`` and ``float32``, the results dtype will be ``float32``. This may require copying data and coercing values, which may be expensive. Parameters ---------- dtype : str or numpy.dtype, optional The dtype to pass to :meth:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. na_value : Any, optional The value to use for missing values. The default value depends on `dtype` and the dtypes of the DataFrame columns. .. versionadded:: 1.1.0 Returns ------- numpy.ndarray See Also -------- Series.to_numpy : Similar method for Series. Examples -------- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() array([[1, 3], [2, 4]]) With heterogeneous data, the lowest common type will have to be used. >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) >>> df.to_numpy() array([[1. , 3. ], [2. , 4.5]]) For a mix of numeric and non-numeric types, the output array will have object dtype. >>> df['C'] = pd.date_range('2000', periods=2) >>> df.to_numpy() array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ self._consolidate_inplace() result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) if result.dtype is not dtype: result = np.array(result, dtype=dtype, copy=False) return result
[docs] def to_dict(self, orient="dict", into=dict): """ Convert the DataFrame to a dictionary. The type of the key-value pairs can be customized with the parameters (see below). Parameters ---------- orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} Determines the type of the values of the dictionary. - 'dict' (default) : dict like {column -> {index -> value}} - 'list' : dict like {column -> [values]} - 'series' : dict like {column -> Series(values)} - 'split' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} - 'records' : list like [{column -> value}, ... , {column -> value}] - 'index' : dict like {index -> {column -> value}} Abbreviations are allowed. `s` indicates `series` and `sp` indicates `split`. into : class, default dict The collections.abc.Mapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. Returns ------- dict, list or collections.abc.Mapping Return a collections.abc.Mapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. See Also -------- DataFrame.from_dict: Create a DataFrame from a dictionary. DataFrame.to_json: Convert a DataFrame to JSON format. Examples -------- >>> df = pd.DataFrame({'col1': [1, 2], ... 'col2': [0.5, 0.75]}, ... index=['row1', 'row2']) >>> df col1 col2 row1 1 0.50 row2 2 0.75 >>> df.to_dict() {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} You can specify the return orientation. >>> df.to_dict('series') {'col1': row1 1 row2 2 Name: col1, dtype: int64, 'col2': row1 0.50 row2 0.75 Name: col2, dtype: float64} >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]} >>> df.to_dict('records') [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} You can also specify the mapping type. >>> from collections import OrderedDict, defaultdict >>> df.to_dict(into=OrderedDict) OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", UserWarning, stacklevel=2, ) # GH16122 into_c = com.standardize_mapping(into) orient = orient.lower() # GH32515 if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { "dict", "list", "series", "split", "records", "index", }: warnings.warn( "Using short name for 'orient' is deprecated. Only the " "options: ('dict', list, 'series', 'split', 'records', 'index') " "will be used in a future version. Use one of the above " "to silence this warning.", FutureWarning, ) if orient.startswith("d"): orient = "dict" elif orient.startswith("l"): orient = "list" elif orient.startswith("sp"): orient = "split" elif orient.startswith("s"): orient = "series" elif orient.startswith("r"): orient = "records" elif orient.startswith("i"): orient = "index" if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) elif orient == "list": return into_c((k, v.tolist()) for k, v in self.items()) elif orient == "split": return into_c( ( ("index", self.index.tolist()), ("columns", self.columns.tolist()), ( "data", [ list(map(maybe_box_datetimelike, t)) for t in self.itertuples(index=False, name=None) ], ), ) ) elif orient == "series": return into_c((k, maybe_box_datetimelike(v)) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() rows = ( dict(zip(columns, row)) for row in self.itertuples(index=False, name=None) ) return [ into_c((k, maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") return into_c( (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) else: raise ValueError(f"orient '{orient}' not understood")
[docs] def to_gbq( self, destination_table, project_id=None, chunksize=None, reauth=False, if_exists="fail", auth_local_webserver=False, table_schema=None, location=None, progress_bar=True, credentials=None, ) -> None: """ Write a DataFrame to a Google BigQuery table. This function requires the `pandas-gbq package <https://pandas-gbq.readthedocs.io>`__. See the `How to authenticate with Google BigQuery <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__ guide for authentication instructions. Parameters ---------- destination_table : str Name of table to be written, in the form ``dataset.tablename``. project_id : str, optional Google BigQuery Account project ID. Optional when available from the environment. chunksize : int, optional Number of rows to be inserted in each chunk from the dataframe. Set to ``None`` to load the whole dataframe at once. reauth : bool, default False Force Google BigQuery to re-authenticate the user. This is useful if multiple accounts are used. if_exists : str, default 'fail' Behavior when the destination table exists. Value can be one of: ``'fail'`` If table exists raise pandas_gbq.gbq.TableCreationError. ``'replace'`` If table exists, drop it, recreate it, and insert data. ``'append'`` If table exists, insert data. Create if does not exist. auth_local_webserver : bool, default False Use the `local webserver flow`_ instead of the `console flow`_ when getting user credentials. .. _local webserver flow: https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server .. _console flow: https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console *New in version 0.2.0 of pandas-gbq*. table_schema : list of dicts, optional List of BigQuery table fields to which according DataFrame columns conform to, e.g. ``[{'name': 'col1', 'type': 'STRING'},...]``. If schema is not provided, it will be generated according to dtypes of DataFrame columns. See BigQuery API documentation on available names of a field. *New in version 0.3.1 of pandas-gbq*. location : str, optional Location where the load job should run. See the `BigQuery locations documentation <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a list of available locations. The location must match that of the target dataset. *New in version 0.5.0 of pandas-gbq*. progress_bar : bool, default True Use the library `tqdm` to show the progress bar for the upload, chunk by chunk. *New in version 0.5.0 of pandas-gbq*. credentials : google.auth.credentials.Credentials, optional Credentials for accessing Google APIs. Use this parameter to override default credentials, such as to use Compute Engine :class:`google.auth.compute_engine.Credentials` or Service Account :class:`google.oauth2.service_account.Credentials` directly. *New in version 0.8.0 of pandas-gbq*. .. versionadded:: 0.24.0 See Also -------- pandas_gbq.to_gbq : This function in the pandas-gbq library. read_gbq : Read a DataFrame from Google BigQuery. """ from pandas.io import gbq gbq.to_gbq( self, destination_table, project_id=project_id, chunksize=chunksize, reauth=reauth, if_exists=if_exists, auth_local_webserver=auth_local_webserver, table_schema=table_schema, location=location, progress_bar=progress_bar, credentials=credentials, )
[docs] @classmethod def from_records( cls, data, index=None, exclude=None, columns=None, coerce_float=False, nrows=None, ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. Creates a DataFrame object from a structured ndarray, sequence of tuples or dicts, or DataFrame. Parameters ---------- data : structured ndarray, sequence of tuples or dicts, or DataFrame Structured input data. index : str, list of fields, array-like Field of array to use as the index, alternately a specific set of input labels to use. exclude : sequence, default None Columns or fields to exclude. columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the columns. Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA columns). coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. nrows : int, default None Number of rows to read if data is an iterator. Returns ------- DataFrame See Also -------- DataFrame.from_dict : DataFrame from dict of array-like or dicts. DataFrame : DataFrame object creation using constructor. Examples -------- Data can be provided as a structured ndarray: >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a 1 2 b 2 1 c 3 0 d Data can be provided as a list of dicts: >>> data = [{'col_1': 3, 'col_2': 'a'}, ... {'col_1': 2, 'col_2': 'b'}, ... {'col_1': 1, 'col_2': 'c'}, ... {'col_1': 0, 'col_2': 'd'}] >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a 1 2 b 2 1 c 3 0 d Data can be provided as a list of tuples with corresponding columns: >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) col_1 col_2 0 3 a 1 2 b 2 1 c 3 0 d """ # Make a copy of the input columns so we can modify it if columns is not None: columns = ensure_index(columns) if is_iterator(data): if nrows == 0: return cls() try: first_row = next(data) except StopIteration: return cls(index=index, columns=columns) dtype = None if hasattr(first_row, "dtype") and first_row.dtype.names: dtype = first_row.dtype values = [first_row] if nrows is None: values += data else: values.extend(itertools.islice(data, nrows - 1)) if dtype is not None: data = np.array(values, dtype=dtype) else: data = values if isinstance(data, dict): if columns is None: columns = arr_columns = ensure_index(sorted(data)) arrays = [data[k] for k in columns] else: arrays = [] arr_columns_list = [] for k, v in data.items(): if k in columns: arr_columns_list.append(k) arrays.append(v) arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) if columns is not None: columns = ensure_index(columns) arr_columns = columns else: arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float) arr_columns = ensure_index(arr_columns) if columns is not None: columns = ensure_index(columns) else: columns = arr_columns if exclude is None: exclude = set() else: exclude = set(exclude) result_index = None if index is not None: if isinstance(index, str) or not hasattr(index, "__iter__"): i = columns.get_loc(index) exclude.add(index) if len(arrays) > 0: result_index = Index(arrays[i], name=index) else: result_index = Index([], name=index) else: try: index_data = [arrays[arr_columns.get_loc(field)] for field in index] except (KeyError, TypeError): # raised by get_loc, see GH#29258 result_index = index else: result_index = ensure_index_from_sequences(index_data, names=index) exclude.update(index) if any(exclude): arr_exclude = [x for x in exclude if x in arr_columns] to_remove = [arr_columns.get_loc(col) for col in arr_exclude] arrays = [v for i, v in enumerate(arrays) if i not in to_remove] arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr)
[docs] def to_records( self, index=True, column_dtypes=None, index_dtypes=None ) -> np.recarray: """ Convert DataFrame to a NumPy record array. Index will be included as the first field of the record array if requested. Parameters ---------- index : bool, default True Include index in resulting record array, stored in 'index' field or using the index label, if set. column_dtypes : str, type, dict, default None .. versionadded:: 0.24.0 If a string or type, the data type to store all columns. If a dictionary, a mapping of column names and indices (zero-indexed) to specific data types. index_dtypes : str, type, dict, default None .. versionadded:: 0.24.0 If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices (zero-indexed) to specific data types. This mapping is applied only if `index=True`. Returns ------- numpy.recarray NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries. See Also -------- DataFrame.from_records: Convert structured or record ndarray to DataFrame. numpy.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. Examples -------- >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, ... index=['a', 'b']) >>> df A B a 1 0.50 b 2 0.75 >>> df.to_records() rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) If the DataFrame index has no label then the recarray field name is set to 'index'. If the index has a label then this is used as the field name: >>> df.index = df.index.rename("I") >>> df.to_records() rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')]) The index can be excluded from the record array: >>> df.to_records(index=False) rec.array([(1, 0.5 ), (2, 0.75)], dtype=[('A', '<i8'), ('B', '<f8')]) Data types can be specified for the columns: >>> df.to_records(column_dtypes={"A": "int32"}) rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) As well as for the index: >>> df.to_records(index_dtypes="<S2") rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) >>> index_dtypes = f"<S{df.index.str.len().max()}" >>> df.to_records(index_dtypes=index_dtypes) rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) """ if index: if isinstance(self.index, MultiIndex): # array of tuples to numpy cols. copy copy copy ix_vals = list(map(np.array, zip(*self.index._values))) else: ix_vals = [self.index.values] arrays = ix_vals + [ np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) ] count = 0 index_names = list(self.index.names) if isinstance(self.index, MultiIndex): for i, n in enumerate(index_names): if n is None: index_names[i] = f"level_{count}" count += 1 elif index_names[0] is None: index_names = ["index"] names = [str(name) for name in itertools.chain(index_names, self.columns)] else: arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] names = [str(c) for c in self.columns] index_names = [] index_len = len(index_names) formats = [] for i, v in enumerate(arrays): index = i # When the names and arrays are collected, we # first collect those in the DataFrame's index, # followed by those in its columns. # # Thus, the total length of the array is: # len(index_names) + len(DataFrame.columns). # # This check allows us to see whether we are # handling a name / array in the index or column. if index < index_len: dtype_mapping = index_dtypes name = index_names[index] else: index -= index_len dtype_mapping = column_dtypes name = self.columns[index] # We have a dictionary, so we get the data type # associated with the index or column (which can # be denoted by its name in the DataFrame or its # position in DataFrame's array of indices or # columns, whichever is applicable. if is_dict_like(dtype_mapping): if name in dtype_mapping: dtype_mapping = dtype_mapping[name] elif index in dtype_mapping: dtype_mapping = dtype_mapping[index] else: dtype_mapping = None # If no mapping can be found, use the array's # dtype attribute for formatting. # # A valid dtype must either be a type or # string naming a type. if dtype_mapping is None: formats.append(v.dtype) elif isinstance(dtype_mapping, (type, np.dtype, str)): formats.append(dtype_mapping) else: element = "row" if i < index_len else "column" msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" raise ValueError(msg) return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
@classmethod def _from_arrays( cls, arrays, columns, index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. Parameters ---------- arrays : list-like of arrays Each array in the list corresponds to one column, in order. columns : list-like, Index The column names for the resulting DataFrame. index : list-like, Index The rows labels for the resulting DataFrame. dtype : dtype, optional Optional dtype to enforce for all arrays. verify_integrity : bool, default True Validate and homogenize all input. If set to False, it is assumed that all elements of `arrays` are actual arrays how they will be stored in a block (numpy ndarray or ExtensionArray), have the same length as and are aligned with the index, and that `columns` and `index` are ensured to be an Index object. Returns ------- DataFrame """ if dtype is not None: dtype = pandas_dtype(dtype) mgr = arrays_to_mgr( arrays, columns, index, columns, dtype=dtype, verify_integrity=verify_integrity, ) return cls(mgr)
[docs] @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, path: FilePathOrBuffer, convert_dates: Optional[Dict[Label, str]] = None, write_index: bool = True, byteorder: Optional[str] = None, time_stamp: Optional[datetime.datetime] = None, data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: """ Export DataFrame object to Stata dta format. Writes the DataFrame to a Stata dataset file. "dta" files contain a Stata dataset. Parameters ---------- path : str, buffer or path object String, path object (pathlib.Path or py._path.local.LocalPath) or object implementing a binary write() function. If using a buffer then the buffer will not be automatically closed after the file data has been written. .. versionchanged:: 1.0.0 Previously this was "fname" convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. Datetime columns that do not have a conversion type specified will be converted to 'tc'. Raises NotImplementedError if a datetime column has timezone information. write_index : bool Write the index to Stata dataset. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder`. time_stamp : datetime A datetime to use as file creation date. Default is the current time. data_label : str, optional A label for the data set. Must be 80 characters or smaller. variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and later. Version 117 can be read by Stata 13 or later. Version 118 is supported in Stata 14 and later. Version 119 is supported in Stata 15 and later. Version 114 limits string variables to 244 characters or fewer while versions 117 and later allow strings with lengths up to 2,000,000 characters. Versions 118 and 119 support Unicode characters, and version 119 supports more than 32,767 variables. Version 119 should usually only be used when the number of variables exceeds the capacity of dta format 118. Exporting smaller datasets in format 119 may have unintended consequences, and, as of November 2020, Stata SE cannot read version 119 files. .. versionchanged:: 1.0.0 Added support for formats 118 and 119. convert_strl : list, optional List of column names to convert to string columns to Stata StrL format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and `fname` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no compression). If dict and compression mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. .. versionadded:: 1.1.0 {storage_options} .. versionadded:: 1.2.0 Raises ------ NotImplementedError * If datetimes contain timezone information * Column dtype is not representable in Stata ValueError * Columns listed in convert_dates are neither datetime64[ns] or datetime.datetime * Column listed in convert_dates is not in DataFrame * Categorical label contains more than 32,000 characters See Also -------- read_stata : Import Stata data files. io.stata.StataWriter : Low-level writer for Stata data files. io.stata.StataWriter117 : Low-level writer for version 117 files. Examples -------- >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], ... 'speed': [350, 18, 361, 15]}}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): raise ValueError("Only formats 114, 117, 118 and 119 are supported.") if version == 114: if convert_strl is not None: raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter elif version == 117: # mypy: Name 'statawriter' already defined (possibly by an import) from pandas.io.stata import ( # type: ignore[no-redef] StataWriter117 as statawriter, ) else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) from pandas.io.stata import ( # type: ignore[no-redef] StataWriterUTF8 as statawriter, ) kwargs: Dict[str, Any] = {} if version is None or version >= 117: # strl conversion is only supported >= 117 kwargs["convert_strl"] = convert_strl if version is None or version >= 118: # Specifying the version is only supported for UTF8 (118 or 119) kwargs["version"] = version # mypy: Too many arguments for "StataWriter" writer = statawriter( # type: ignore[call-arg] path, self, convert_dates=convert_dates, byteorder=byteorder, time_stamp=time_stamp, data_label=data_label, write_index=write_index, variable_labels=variable_labels, compression=compression, storage_options=storage_options, **kwargs, ) writer.write_file()
[docs] @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: """ Write a DataFrame to the binary Feather format. Parameters ---------- path : str or file-like object If a string, it will be used as Root Directory path. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. Starting with pyarrow 0.17, this includes the `compression`, `compression_level`, `chunksize` and `version` keywords. .. versionadded:: 1.1.0 """ from pandas.io.feather_format import to_feather to_feather(self, path, **kwargs)
[docs] @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], storage_options=_shared_docs["storage_options"], examples="""Examples -------- >>> df = pd.DataFrame( ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} ... ) >>> print(df.to_markdown()) | | animal_1 | animal_2 | |---:|:-----------|:-----------| | 0 | elk | dog | | 1 | pig | quetzal | Output markdown with a tabulate option. >>> print(df.to_markdown(tablefmt="grid")) +----+------------+------------+ | | animal_1 | animal_2 | +====+============+============+ | 0 | elk | dog | +----+------------+------------+ | 1 | pig | quetzal | +----+------------+------------+ """, ) def to_markdown( self, buf: Optional[Union[IO[str], str]] = None, mode: str = "wt", index: bool = True, storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: if "showindex" in kwargs: warnings.warn( "'showindex' is deprecated. Only 'index' will be used " "in a future version. Use 'index' to silence this warning.", FutureWarning, stacklevel=2, ) kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") kwargs.setdefault("showindex", index) tabulate = import_optional_dependency("tabulate") result = tabulate.tabulate(self, **kwargs) if buf is None: return result with get_handle(buf, mode, storage_options=storage_options) as handles: assert not isinstance(handles.handle, (str, mmap.mmap)) handles.handle.writelines(result) return None
[docs] @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, path: Optional[FilePathOrBuffer] = None, engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, storage_options: StorageOptions = None, **kwargs, ) -> Optional[bytes]: """ Write a DataFrame to the binary parquet format. This function writes the dataframe as a `parquet file <https://parquet.apache.org/>`_. You can choose different parquet backends, and have the option of compression. See :ref:`the user guide <io.parquet>` for more details. Parameters ---------- path : str or file-like object, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function) or io.BytesIO. The engine fastparquet does not accept file-like objects. If path is None, a bytes object is returned. .. versionchanged:: 1.2.0 Previously this was "fname" engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``True`` the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. .. versionadded:: 0.24.0 partition_cols : list, optional, default None Column names by which to partition the dataset. Columns are partitioned in the order they are given. Must be None if path is not a string. .. versionadded:: 0.24.0 {storage_options} .. versionadded:: 1.2.0 **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io <io.parquet>` for more details. Returns ------- bytes if no path argument is provided else None See Also -------- read_parquet : Read a parquet file. DataFrame.to_csv : Write a csv file. DataFrame.to_sql : Write to a sql table. DataFrame.to_hdf : Write to hdf. Notes ----- This function requires either the `fastparquet <https://pypi.org/project/fastparquet>`_ or `pyarrow <https://arrow.apache.org/docs/python/>`_ library. Examples -------- >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) >>> df.to_parquet('df.parquet.gzip', ... compression='gzip') # doctest: +SKIP >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP col1 col2 0 1 3 1 2 4 If you want to get a buffer to the parquet content you can use a io.BytesIO object, as long as you don't use partition_cols, which creates multiple files. >>> import io >>> f = io.BytesIO() >>> df.to_parquet(f) >>> f.seek(0) 0 >>> content = f.read() """ from pandas.io.parquet import to_parquet return to_parquet( self, path, engine, compression=compression, index=index, partition_cols=partition_cols, storage_options=storage_options, **kwargs, )
[docs] @Substitution( header_type="bool", header="Whether to print column labels, default True", col_space_type="str or int, list or dict of int or str", col_space="The minimum width of each column in CSS length " "units. An int is assumed to be px units.\n\n" " .. versionadded:: 0.25.0\n" " Ability to use str", ) @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html( self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, max_cols=None, show_dimensions=False, decimal=".", bold_rows=True, classes=None, escape=True, notebook=False, border=None, table_id=None, render_links=False, encoding=None, ): """ Render a DataFrame as an HTML table. %(shared_params)s bold_rows : bool, default True Make the row labels bold in the output. classes : str or list or tuple, default None CSS class(es) to apply to the resulting html table. escape : bool, default True Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. border : int A ``border=border`` attribute is included in the opening `<table>` tag. Default ``pd.options.display.html.border``. encoding : str, default "utf-8" Set character encoding. .. versionadded:: 1.0 table_id : str, optional A css id is included in the opening `<table>` tag if specified. render_links : bool, default False Convert URLs to HTML links. .. versionadded:: 0.24.0 %(returns)s See Also -------- to_string : Convert DataFrame to a string. """ if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") formatter = fmt.DataFrameFormatter( self, columns=columns, col_space=col_space, na_rep=na_rep, header=header, index=index, formatters=formatters, float_format=float_format, bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, escape=escape, decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, ) # TODO: a generic formatter wld b in DataFrameFormatter return fmt.DataFrameRenderer(formatter).to_html( buf=buf, classes=classes, notebook=notebook, border=border, encoding=encoding, table_id=table_id, render_links=render_links, )
# ----------------------------------------------------------------------
[docs] @Substitution( klass="DataFrame", type_sub=" and columns", max_cols_sub=dedent( """\ max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in ``pandas.options.display.max_info_columns`` is used.""" ), show_counts_sub=dedent( """\ show_counts : bool, optional Whether to show the non-null counts. By default, this is shown only if the DataFrame is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always shows the counts, and False never shows the counts. null_counts : bool, optional .. deprecated:: 1.2.0 Use show_counts instead.""" ), examples_sub=dedent( """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, ... "float_col": float_values}) >>> df int_col text_col float_col 0 1 alpha 0.00 1 2 beta 0.25 2 3 gamma 0.50 3 4 delta 0.75 4 5 epsilon 1.00 Prints information of all columns: >>> df.info(verbose=True) <class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 int_col 5 non-null int64 1 text_col 5 non-null object 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes Prints a summary of columns count and its dtypes but not per column information: >>> df.info(verbose=False) <class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Columns: 3 entries, int_col to float_col dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content and writes to a text file: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) >>> s = buffer.getvalue() >>> with open("df_info.txt", "w", ... encoding="utf-8") as f: # doctest: +SKIP ... f.write(s) 260 The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) >>> df = pd.DataFrame({ ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) ... }) >>> df.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 column_1 1000000 non-null object 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB >>> df.info(memory_usage='deep') <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 column_1 1000000 non-null object 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 165.9 MB""" ), see_also_sub=dedent( """\ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), version_added_sub="", ) @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, show_counts: Optional[bool] = None, null_counts: Optional[bool] = None, ) -> None: if null_counts is not None: if show_counts is not None: raise ValueError("null_counts used with show_counts. Use show_counts.") warnings.warn( "null_counts is deprecated. Use show_counts instead", FutureWarning, stacklevel=2, ) show_counts = null_counts info = DataFrameInfo( data=self, memory_usage=memory_usage, ) info.render( buf=buf, max_cols=max_cols, verbose=verbose, show_counts=show_counts, )
[docs] def memory_usage(self, index=True, deep=False) -> Series: """ Return the memory usage of each column in bytes. The memory usage can optionally include the contribution of the index and elements of `object` dtype. This value is displayed in `DataFrame.info` by default. This can be suppressed by setting ``pandas.options.display.memory_usage`` to False. Parameters ---------- index : bool, default True Specifies whether to include the memory usage of the DataFrame's index in returned Series. If ``index=True``, the memory usage of the index is the first item in the output. deep : bool, default False If True, introspect the data deeply by interrogating `object` dtypes for system-level memory consumption, and include it in the returned values. Returns ------- Series A Series whose index is the original column names and whose values is the memory usage of each column in bytes. See Also -------- numpy.ndarray.nbytes : Total bytes consumed by the elements of an ndarray. Series.memory_usage : Bytes consumed by a Series. Categorical : Memory-efficient array for string values with many repeated values. DataFrame.info : Concise summary of a DataFrame. Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) ... for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool 0 1 1.0 1.0+0.0j 1 True 1 1 1.0 1.0+0.0j 1 True 2 1 1.0 1.0+0.0j 1 True 3 1 1.0 1.0+0.0j 1 True 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() Index 128 int64 40000 float64 40000 complex128 80000 object 40000 bool 5000 dtype: int64 >>> df.memory_usage(index=False) int64 40000 float64 40000 complex128 80000 object 40000 bool 5000 dtype: int64 The memory footprint of `object` dtype columns is ignored by default: >>> df.memory_usage(deep=True) Index 128 int64 40000 float64 40000 complex128 80000 object 180000 bool 5000 dtype: int64 Use a Categorical for efficient storage of an object-dtype column with many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) 5244 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], index=self.columns, ) if index: result = self._constructor_sliced( self.index.memory_usage(deep=deep), index=["Index"] ).append(result) return result
[docs] def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. Reflect the DataFrame over its main diagonal by writing rows as columns and vice-versa. The property :attr:`.T` is an accessor to the method :meth:`transpose`. Parameters ---------- *args : tuple, optional Accepted for compatibility with NumPy. copy : bool, default False Whether to copy the data after transposing, even for DataFrames with a single dtype. Note that a copy is always required for mixed dtype DataFrames, or for DataFrames with any extension types. Returns ------- DataFrame The transposed DataFrame. See Also -------- numpy.transpose : Permute the dimensions of a given array. Notes ----- Transposing a DataFrame with mixed dtypes will result in a homogeneous DataFrame with the `object` dtype. In such a case, a copy of the data is always made. Examples -------- **Square DataFrame with homogeneous dtype** >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = pd.DataFrame(data=d1) >>> df1 col1 col2 0 1 3 1 2 4 >>> df1_transposed = df1.T # or df1.transpose() >>> df1_transposed 0 1 col1 1 2 col2 3 4 When the dtype is homogeneous in the original DataFrame, we get a transposed DataFrame with the same dtype: >>> df1.dtypes col1 int64 col2 int64 dtype: object >>> df1_transposed.dtypes 0 int64 1 int64 dtype: object **Non-square DataFrame with mixed dtypes** >>> d2 = {'name': ['Alice', 'Bob'], ... 'score': [9.5, 8], ... 'employed': [False, True], ... 'kids': [0, 0]} >>> df2 = pd.DataFrame(data=d2) >>> df2 name score employed kids 0 Alice 9.5 False 0 1 Bob 8.0 True 0 >>> df2_transposed = df2.T # or df2.transpose() >>> df2_transposed 0 1 name Alice Bob score 9.5 8.0 employed False True kids 0 0 When the DataFrame has mixed dtypes, we get a transposed DataFrame with the `object` dtype: >>> df2.dtypes name object score float64 employed bool kids int64 dtype: object >>> df2_transposed.dtypes 0 object 1 object dtype: object """ nv.validate_transpose(args, {}) # construct the args dtypes = list(self.dtypes) if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): # We have EAs with the same dtype. We can preserve that dtype in transpose. dtype = dtypes[0] arr_type = dtype.construct_array_type() values = self.values new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] result = self._constructor( dict(zip(self.index, new_values)), index=self.columns ) else: new_values = self.values.T if copy: new_values = new_values.copy() result = self._constructor( new_values, index=self.columns, columns=self.index ) return result.__finalize__(self, method="transpose")
@property def T(self) -> DataFrame: return self.transpose() # ---------------------------------------------------------------------- # Indexing Methods def _ixs(self, i: int, axis: int = 0): """ Parameters ---------- i : int axis : int Notes ----- If slice passed, the resulting data will be a view. """ # irow if axis == 0: new_values = self._mgr.fast_xs(i) # if we are a copy, mark as such copy = isinstance(new_values, np.ndarray) and new_values.base is None result = self._constructor_sliced( new_values, index=self.columns, name=self.index[i], dtype=new_values.dtype, ) result._set_is_copy(self, copy=copy) return result # icol else: label = self.columns[i] values = self._mgr.iget(i) result = self._box_col_values(values, i) # this is a cached value, mark it so result._set_as_cached(label, self) return result def _get_column_array(self, i: int) -> ArrayLike: """ Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ Iterate over the arrays of all columns in order. This returns the values as stored in the Block (ndarray or ExtensionArray). """ for i in range(len(self.columns)): yield self._get_column_array(i) def __getitem__(self, key): key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) if is_hashable(key): # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: if isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) return self._get_item_cache(key) # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: if isinstance(indexer, np.ndarray): indexer = lib.maybe_indices_to_slice( indexer.astype(np.intp, copy=False), len(self) ) # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._slice(indexer, axis=0) # Do we have a (boolean) DataFrame? if isinstance(key, DataFrame): return self.where(key) # Do we have a (boolean) 1d indexer? if com.is_bool_indexer(key): return self._getitem_bool_array(key) # We are left with two options: a single key, and a collection of keys, # We interpret tuples as collections only for non-MultiIndex is_single_key = isinstance(key, tuple) or not is_list_like(key) if is_single_key: if self.columns.nlevels > 1: return self._getitem_multilevel(key) indexer = self.columns.get_loc(key) if is_integer(indexer): indexer = [indexer] else: if is_iterator(key): key = list(key) indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1] # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: indexer = np.where(indexer)[0] data = self._take_with_is_copy(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? # The behavior is inconsistent. It returns a Series, except when # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): # GH#26490 using data[key] can cause RecursionError data = data._get_item_cache(key) return data def _getitem_bool_array(self, key): # also raises Exception if object array with NA values # warning here just in case -- previously __setitem__ was # reindexing but __getitem__ was not; it seems more reasonable to # go with the __setitem__ behavior since that is more consistent # with all other indexing behavior if isinstance(key, Series) and not key.index.equals(self.index): warnings.warn( "Boolean Series key will be reindexed to match DataFrame index.", UserWarning, stacklevel=3, ) elif len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}." ) # check_bool_indexer will throw exception if Series key cannot # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] return self._take_with_is_copy(indexer, axis=0) def _getitem_multilevel(self, key): # self.columns is a MultiIndex loc = self.columns.get_loc(key) if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) if self._is_mixed_type: result = self.reindex(columns=new_columns) result.columns = result_columns else: new_values = self.values[:, loc] result = self._constructor( new_values, index=self.index, columns=result_columns ) result = result.__finalize__(self) # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its # first element, then treat the empty string as a placeholder # and return the column as if the user had provided that empty # string in the key. If the result is a Series, exclude the # implied empty string from its name. if len(result.columns) == 1: top = result.columns[0] if isinstance(top, tuple): top = top[0] if top == "": result = result[""] if isinstance(result, Series): result = self._constructor_sliced( result, index=self.index, name=key ) result._set_is_copy(self) return result else: # loc is neither a slice nor ndarray, so must be an int return self._ixs(loc, axis=1) def _get_value(self, index, col, takeable: bool = False): """ Quickly retrieve single value at passed column and index. Parameters ---------- index : row label col : column label takeable : interpret the index/col as indexers, default False Returns ------- scalar """ if takeable: series = self._ixs(col, axis=1) return series._values[index] series = self._get_item_cache(col) engine = self.index._engine try: loc = engine.get_loc(index) return series._values[loc] except KeyError: # GH 20629 if self.index.nlevels > 1: # partial indexing forbidden raise # we cannot handle direct indexing # use positional col = self.columns.get_loc(col) index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) # see if we can slice the rows indexer = convert_to_index_sliceable(self, key) if indexer is not None: # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._setitem_slice(indexer, value) if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: self._setitem_frame(key, value) elif isinstance(key, (Series, np.ndarray, list, Index)): self._setitem_array(key, value) else: # set column self._set_item(key, value) def _setitem_slice(self, key: slice, value): # NB: we can't just use self.loc[key] = value because that # operates on labels and we need to operate positional for # backwards-compat, xref GH#31469 self._check_setitem_copy() self.iloc[key] = value def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() if isinstance(value, DataFrame): # GH#39931 reindex since iloc does not align value = value.reindex(self.index.take(indexer)) self.iloc[indexer] = value else: if isinstance(value, DataFrame): if len(value.columns) != len(key): raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: self.loc._ensure_listlike_indexer(key, axis=1, value=value) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] self._check_setitem_copy() self.iloc[:, indexer] = value def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. # df[df > df2] = 0 if isinstance(key, np.ndarray): if key.shape != self.shape: raise ValueError("Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict()) if key.size and not is_bool_dtype(key.values): raise TypeError( "Must pass DataFrame or 2-d ndarray with boolean values only" ) self._check_inplace_setting(value) self._check_setitem_copy() self._where(-key, value, inplace=True) def _iset_item(self, loc: int, value): self._ensure_valid_index(value) # technically _sanitize_column expects a label, not a position, # but the behavior is the same as long as we pass broadcast=False value = self._sanitize_column(loc, value, broadcast=False) NDFrame._iset_item(self, loc, value) # check if we are modifying a copy # try to set first as we want an invalid # value exception to occur first if len(self): self._check_setitem_copy() def _set_item(self, key, value): """ Add series to DataFrame in specified column. If series is a numpy-array (not a Series/TimeSeries), it must be the same length as the DataFrames index or an error will be thrown. Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ self._ensure_valid_index(value) value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) # check if we are modifying a copy # try to set first as we want an invalid # value exception to occur first if len(self): self._check_setitem_copy() def _set_value(self, index, col, value, takeable: bool = False): """ Put single value at passed column and index. Parameters ---------- index : row label col : column label value : scalar takeable : interpret the index/col as indexers, default False """ try: if takeable is True: series = self._ixs(col, axis=1) series._set_value(index, value, takeable=True) return series = self._get_item_cache(col) engine = self.index._engine loc = engine.get_loc(index) validate_numeric_casting(series.dtype, value) series._values[loc] = value # Note: trying to use series._set_value breaks tests in # tests.frame.indexing.test_indexing and tests.indexing.test_partial except (KeyError, TypeError): # set using a non-recursive method & reset the cache if takeable: self.iloc[index, col] = value else: self.loc[index, col] = value self._item_cache.pop(col, None) def _ensure_valid_index(self, value): """ Ensure that if we don't have an index, that we can create one from the passed value. """ # GH5632, make sure that we are a Series convertible if not len(self.index) and is_list_like(value) and len(value): try: value = Series(value) except (ValueError, NotImplementedError, TypeError) as err: raise ValueError( "Cannot set a frame with no defined index " "and a value that cannot be converted to a Series" ) from err # GH31368 preserve name of index index_copy = value.index.copy() if self.index.name is not None: index_copy.name = self.index.name self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) def _box_col_values(self, values, loc: int) -> Series: """ Provide boxed values for a column. """ # Lookup in columns so that if e.g. a str datetime was passed # we attach the Timestamp object as the name. name = self.columns[loc] klass = self._constructor_sliced return klass(values, index=self.index, name=name, fastpath=True) # ---------------------------------------------------------------------- # Unsorted
[docs] def query(self, expr, inplace=False, **kwargs): """ Query the columns of a DataFrame with a boolean expression. Parameters ---------- expr : str The query string to evaluate. You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. You can refer to column names that are not valid Python variable names by surrounding them in backticks. Thus, column names containing spaces or punctuations (besides underscores) or starting with digits must be surrounded by backticks. (For example, a column named "Area (cm^2) would be referenced as `Area (cm^2)`). Column names which are Python keywords (like "list", "for", "import", etc) cannot be used. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. .. versionadded:: 0.25.0 Backtick quoting introduced. .. versionadded:: 1.0.0 Expanding functionality of backtick quoting for more than only spaces. inplace : bool Whether the query should modify the data in place or return a modified copy. **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by :meth:`DataFrame.query`. Returns ------- DataFrame or None DataFrame resulting from the provided query expression or None if ``inplace=True``. See Also -------- eval : Evaluate a string describing operations on DataFrame columns. DataFrame.eval : Evaluate a string describing operations on DataFrame columns. Notes ----- The result of the evaluation of this expression is first passed to :attr:`DataFrame.loc` and if that fails because of a multidimensional key (e.g., a DataFrame) then the result will be passed to :meth:`DataFrame.__getitem__`. This method uses the top-level :func:`eval` function to evaluate the passed query. The :meth:`~pandas.DataFrame.query` method uses a slightly modified Python syntax by default. For example, the ``&`` and ``|`` (bitwise) operators have the precedence of their boolean cousins, :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, however the semantics are different. You can change the semantics of the expression by passing the keyword argument ``parser='python'``. This enforces the same semantics as evaluation in Python space. Likewise, you can pass ``engine='python'`` to evaluate an expression using Python itself as a backend. This is not recommended as it is inefficient compared to using ``numexpr`` as the engine. The :attr:`DataFrame.index` and :attr:`DataFrame.columns` attributes of the :class:`~pandas.DataFrame` instance are placed in the query namespace by default, which allows you to treat both the index and columns of the frame as a column in the frame. The identifier ``index`` is used for the frame index; you can also use the name of the index to identify it in a query. Please note that Python keywords may not be used as identifiers. For further details and examples see the ``query`` documentation in :ref:`indexing <indexing.query>`. *Backtick quoted variables* Backtick quoted variables are parsed as literal Python code and are converted internally to a Python valid identifier. This can lead to the following problems. During parsing a number of disallowed characters inside the backtick quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. For other characters that fall outside the ASCII range (U+0001..U+007F) and those that are not further specified in PEP 3131, the query parser will raise an error. This excludes whitespace different than the space character, but also the hashtag (as it is used for comments) and the backtick itself (backtick can also not be escaped). In a special case, quotes that make a pair around a backtick can confuse the parser. For example, ```it's` > `that's``` will raise an error, as it forms a quoted string (``'s > `that'``) with a backtick inside. See also the Python documentation about lexical analysis (https://docs.python.org/3/reference/lexical_analysis.html) in combination with the source code in :mod:`pandas.core.computation.parsing`. Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) >>> df A B C C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query('A > B') A B C C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] A B C C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. >>> df.query('B == `C C`') A B C C 0 1 10 10 The previous expression is equivalent to >>> df[df.B == df['C C']] A B C C 0 1 10 10 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): msg = f"expr must be a string to be evaluated, {type(expr)} given" raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None res = self.eval(expr, **kwargs) try: result = self.loc[res] except ValueError: # when res is multi-dimensional loc raises, but this is sometimes a # valid query result = self[res] if inplace: self._update_inplace(result) else: return result
[docs] def eval(self, expr, inplace=False, **kwargs): """ Evaluate a string describing operations on DataFrame columns. Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. Parameters ---------- expr : str The expression string to evaluate. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, a new DataFrame is returned. **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by :meth:`~pandas.DataFrame.query`. Returns ------- ndarray, scalar, pandas object, or None The result of the evaluation or None if ``inplace=True``. See Also -------- DataFrame.query : Evaluates a boolean expression to query the columns of a frame. DataFrame.assign : Can evaluate an expression or function to create new values for a column. eval : Evaluate a Python expression as a string using various backends. Notes ----- For more details see the API documentation for :func:`~eval`. For detailed examples see :ref:`enhancing performance with eval <enhancingperf.eval>`. Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B 0 1 10 1 2 8 2 3 6 3 4 4 4 5 2 >>> df.eval('A + B') 0 11 1 10 2 9 3 8 4 7 dtype: int64 Assignment is allowed though by default the original DataFrame is not modified. >>> df.eval('C = A + B') A B C 0 1 10 11 1 2 8 10 2 3 6 9 3 4 4 8 4 5 2 7 >>> df A B 0 1 10 1 2 8 2 3 6 3 4 4 4 5 2 Use ``inplace=True`` to modify the original DataFrame. >>> df.eval('C = A + B', inplace=True) >>> df A B C 0 1 10 11 1 2 8 10 2 3 6 9 3 4 4 8 4 5 2 7 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' ... C = A + B ... D = A - B ... ''' ... ) A B C D 0 1 10 11 -9 1 2 8 10 -6 2 3 6 9 -3 3 4 4 8 0 4 5 2 7 3 """ from pandas.core.computation.eval import eval as _eval inplace = validate_bool_kwarg(inplace, "inplace") resolvers = kwargs.pop("resolvers", None) kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers if "target" not in kwargs: kwargs["target"] = self kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) return _eval(expr, inplace=inplace, **kwargs)
[docs] def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. Parameters ---------- include, exclude : scalar or list-like A selection of dtypes or strings to be included/excluded. At least one of these parameters must be supplied. Returns ------- DataFrame The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``. Raises ------ ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements * If any kind of string dtype is passed in. See Also -------- DataFrame.dtypes: Return Series with the data type of each column. Notes ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that this will return *all* object dtype columns * See the `numpy dtype hierarchy <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or ``'datetime64'`` * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or ``'timedelta64'`` * To select Pandas categorical dtypes, use ``'category'`` * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in 0.20.0) or ``'datetime64[ns, tz]'`` Examples -------- >>> df = pd.DataFrame({'a': [1, 2] * 3, ... 'b': [True, False] * 3, ... 'c': [1.0, 2.0] * 3}) >>> df a b c 0 1 True 1.0 1 2 False 2.0 2 1 True 1.0 3 2 False 2.0 4 1 True 1.0 5 2 False 2.0 >>> df.select_dtypes(include='bool') b 0 True 1 False 2 True 3 False 4 True 5 False >>> df.select_dtypes(include=['float64']) c 0 1.0 1 2.0 2 1.0 3 2.0 4 1.0 5 2.0 >>> df.select_dtypes(exclude=['int64']) b c 0 True 1.0 1 False 2.0 2 True 1.0 3 False 2.0 4 True 1.0 5 False 2.0 """ if not is_list_like(include): include = (include,) if include is not None else () if not is_list_like(exclude): exclude = (exclude,) if exclude is not None else () selection = (frozenset(include), frozenset(exclude)) if not any(selection): raise ValueError("at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation include = frozenset(infer_dtype_from_object(x) for x in include) exclude = frozenset(infer_dtype_from_object(x) for x in exclude) for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): raise ValueError(f"include and exclude overlap on {(include & exclude)}") # We raise when both include and exclude are empty # Hence, we can just shrink the columns we want to keep keep_these = np.full(self.shape[1], True) def extract_unique_dtypes_from_dtypes_set( dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray ) -> List[Dtype]: extracted_dtypes = [ unique_dtype for unique_dtype in unique_dtypes # error: Argument 1 to "tuple" has incompatible type # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], # Type[float], Type[int], Type[complex], Type[bool]]]"; # expected "Iterable[Union[type, Tuple[Any, ...]]]" if issubclass( unique_dtype.type, tuple(dtypes_set) # type: ignore[arg-type] ) ] return extracted_dtypes unique_dtypes = self.dtypes.unique() if include: included_dtypes = extract_unique_dtypes_from_dtypes_set( include, unique_dtypes ) keep_these &= self.dtypes.isin(included_dtypes) if exclude: excluded_dtypes = extract_unique_dtypes_from_dtypes_set( exclude, unique_dtypes ) keep_these &= ~self.dtypes.isin(excluded_dtypes) return self.iloc[:, keep_these.values]
[docs] def insert(self, loc, column, value, allow_duplicates=False) -> None: """ Insert column into DataFrame at specified location. Raises a ValueError if `column` is already contained in the DataFrame, unless `allow_duplicates` is set to True. Parameters ---------- loc : int Insertion index. Must verify 0 <= loc <= len(columns). column : str, number, or hashable object Label of the inserted column. value : int, Series, or array-like allow_duplicates : bool, optional """ if allow_duplicates and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'allow_duplicates=True' when " "'self.flags.allows_duplicate_labels' is False." ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates)
[docs] def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. Returns a new object with all original columns in addition to new ones. Existing columns that are re-assigned will be overwritten. Parameters ---------- **kwargs : dict of {str: callable or Series} The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not change input DataFrame (though pandas doesn't check it). If the values are not callable, (e.g. a Series, scalar, or array), they are simply assigned. Returns ------- DataFrame A new DataFrame with the new columns in addition to all the existing columns. Notes ----- Assigning multiple columns within the same ``assign`` is possible. Later items in '\*\*kwargs' may refer to newly created or modified columns in 'df'; items are computed and assigned into 'df' in order. Examples -------- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, ... index=['Portland', 'Berkeley']) >>> df temp_c Portland 17.0 Berkeley 25.0 Where the value is a callable, evaluated on `df`: >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence: >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 You can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) temp_c temp_f temp_k Portland 17.0 62.6 290.15 Berkeley 25.0 77.0 298.15 """ data = self.copy() for k, v in kwargs.items(): data[k] = com.apply_if_callable(v, data) return data
def _sanitize_column(self, key, value, broadcast=True): """ Ensures new columns (which go into the BlockManager as new blocks) are always copied and converted into an array. Parameters ---------- key : object value : scalar, Series, or array-like broadcast : bool, default True If ``key`` matches multiple duplicate column names in the DataFrame, this parameter indicates whether ``value`` should be tiled so that the returned array contains a (duplicated) column for each occurrence of the key. If False, ``value`` will not be tiled. Returns ------- numpy.ndarray """ def reindexer(value): # reindex if necessary if value.index.equals(self.index) or not len(self.index): value = value._values.copy() else: # GH 4107 try: value = value.reindex(self.index)._values except ValueError as err: # raised in MultiIndex.from_tuples, see test_insert_error_msmgs if not value.index.is_unique: # duplicate axis raise err # other raise TypeError( "incompatible index of inserted column with frame index" ) from err return value if isinstance(value, Series): value = reindexer(value) elif isinstance(value, DataFrame): # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame if isinstance(self.columns, MultiIndex) and key in self.columns: loc = self.columns.get_loc(key) if isinstance(loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(self.columns[loc], key) if len(cols) and not cols.equals(value.columns): value = value.reindex(cols, axis=1) # now align rows value = reindexer(value).T elif isinstance(value, ExtensionArray): # Explicitly copy here, instead of in sanitize_index, # as sanitize_index won't copy an EA, even with copy=True value = value.copy() value = sanitize_index(value, self.index) elif isinstance(value, Index) or is_sequence(value): # turn me into an ndarray value = sanitize_index(value, self.index) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: value = maybe_convert_platform(value) else: value = com.asarray_tuplesafe(value) elif value.ndim == 2: value = value.copy().T elif isinstance(value, Index): value = value.copy(deep=True) else: value = value.copy() # possibly infer to datetimelike if is_object_dtype(value.dtype): value = maybe_infer_to_datetimelike(value) else: # cast ignores pandas dtypes. so save the dtype first infer_dtype, fill_value = infer_dtype_from_scalar(value, pandas_dtype=True) value = construct_1d_arraylike_from_scalar( fill_value, len(self), infer_dtype ) value = maybe_cast_to_datetime(value, infer_dtype) # return internal types directly if is_extension_array_dtype(value): return value # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: if not self.columns.is_unique or isinstance(self.columns, MultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) return np.atleast_2d(np.asarray(value)) @property def _series(self): return { item: Series( self._mgr.iget(idx), index=self.index, name=item, fastpath=True ) for idx, item in enumerate(self.columns) }
[docs] def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. .. deprecated:: 1.2.0 DataFrame.lookup is deprecated, use DataFrame.melt and DataFrame.loc instead. For further details see :ref:`Looking up values by index/column labels <indexing.lookup>`. Parameters ---------- row_labels : sequence The row labels to use for lookup. col_labels : sequence The column labels to use for lookup. Returns ------- numpy.ndarray The found values. """ msg = ( "The 'lookup' method is deprecated and will be" "removed in a future version." "You can use DataFrame.melt and DataFrame.loc" "as a substitute." ) warnings.warn(msg, FutureWarning, stacklevel=2) n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") if not (self.index.is_unique and self.columns.is_unique): # GH#33041 raise ValueError("DataFrame.lookup requires unique index and columns") thresh = 1000 if not self._is_mixed_type or n > thresh: values = self.values ridx = self.index.get_indexer(row_labels) cidx = self.columns.get_indexer(col_labels) if (ridx == -1).any(): raise KeyError("One or more row labels was not found") if (cidx == -1).any(): raise KeyError("One or more column labels was not found") flat_index = ridx * len(self.columns) + cidx result = values.flat[flat_index] else: result = np.empty(n, dtype="O") for i, (r, c) in enumerate(zip(row_labels, col_labels)): result[i] = self._get_value(r, c) if is_object_dtype(result): result = lib.maybe_convert_objects(result) return result
# ---------------------------------------------------------------------- # Reindexing and alignment def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): frame = self columns = axes["columns"] if columns is not None: frame = frame._reindex_columns( columns, method, copy, level, fill_value, limit, tolerance ) index = axes["index"] if index is not None: frame = frame._reindex_index( index, method, copy, level, fill_value, limit, tolerance ) return frame def _reindex_index( self, new_index, method, copy, level, fill_value=np.nan, limit=None, tolerance=None, ): new_index, indexer = self.index.reindex( new_index, method=method, level=level, limit=limit, tolerance=tolerance ) return self._reindex_with_indexers( {0: [new_index, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False, ) def _reindex_columns( self, new_columns, method, copy, level, fill_value=None, limit=None, tolerance=None, ): new_columns, indexer = self.columns.reindex( new_columns, method=method, level=level, limit=limit, tolerance=tolerance ) return self._reindex_with_indexers( {1: [new_columns, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False, ) def _reindex_multi(self, axes, copy, fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. """ new_index, row_indexer = self.index.reindex(axes["index"]) new_columns, col_indexer = self.columns.reindex(axes["columns"]) if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer new_values = algorithms.take_2d_multi( self.values, indexer, fill_value=fill_value ) return self._constructor(new_values, index=new_index, columns=new_columns) else: return self._reindex_with_indexers( {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, copy=copy, fill_value=fill_value, )
[docs] @doc(NDFrame.align, **_shared_doc_kwargs) def align( self, other, join="outer", axis=None, level=None, copy=True, fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None, ) -> DataFrame: return super().align( other, join=join, axis=axis, level=level, copy=copy, fill_value=fill_value, method=method, limit=limit, fill_axis=fill_axis, broadcast_axis=broadcast_axis, )
[docs] @Appender( """ Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) Change the row labels. >>> df.set_axis(['a', 'b', 'c'], axis='index') A B a 1 4 b 2 5 c 3 6 Change the column labels. >>> df.set_axis(['I', 'II'], axis='columns') I II 0 1 4 1 2 5 2 3 6 Now, update the labels inplace. >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) >>> df i ii 0 1 4 1 2 5 2 3 6 """ ) @Substitution( **_shared_doc_kwargs, extended_summary_sub=" column or", axis_description_sub=", and 1 identifies the columns", see_also_sub=" or columns", ) @Appender(NDFrame.set_axis.__doc__) def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace)
[docs] @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.reindex.__doc__) @rewrite_axis_style_signature( "labels", [ ("method", None), ("copy", True), ("level", None), ("fill_value", np.nan), ("limit", None), ("tolerance", None), ], ) def reindex(self, *args, **kwargs) -> DataFrame: axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) return super().reindex(**kwargs)
[docs] def drop( self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors="raise", ): """ Drop specified labels from rows or columns. Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names. When using a multi-index, labels on different levels can be removed by specifying the level. Parameters ---------- labels : single label or list-like Index or column labels to drop. axis : {0 or 'index', 1 or 'columns'}, default 0 Whether to drop labels from the index (0 or 'index') or columns (1 or 'columns'). index : single label or list-like Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=labels``). columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). level : int or level name, optional For MultiIndex, level from which the labels will be removed. inplace : bool, default False If False, return a copy. Otherwise, do operation inplace and return None. errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and only existing labels are dropped. Returns ------- DataFrame or None DataFrame without the removed index or column labels or None if ``inplace=True``. Raises ------ KeyError If any of the labels is not found in the selected axis. See Also -------- DataFrame.loc : Label-location based indexer for selection by label. DataFrame.dropna : Return DataFrame with labels on given axis omitted where (all or any) data are missing. DataFrame.drop_duplicates : Return DataFrame with duplicate rows removed, optionally only considering certain columns. Series.drop : Return Series with specified index labels removed. Examples -------- >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df A B C D 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 Drop columns >>> df.drop(['B', 'C'], axis=1) A D 0 0 3 1 4 7 2 8 11 >>> df.drop(columns=['B', 'C']) A D 0 0 3 1 4 7 2 8 11 Drop a row by index >>> df.drop([0, 1]) A B C D 2 8 9 10 11 Drop columns and/or rows of MultiIndex DataFrame >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], ... [250, 150], [1.5, 0.8], [320, 250], ... [1, 0.8], [0.3, 0.2]]) >>> df big small lama speed 45.0 30.0 weight 200.0 100.0 length 1.5 1.0 cow speed 30.0 20.0 weight 250.0 150.0 length 1.5 0.8 falcon speed 320.0 250.0 weight 1.0 0.8 length 0.3 0.2 >>> df.drop(index='cow', columns='small') big lama speed 45.0 weight 200.0 length 1.5 falcon speed 320.0 weight 1.0 length 0.3 >>> df.drop(index='length', level=1) big small lama speed 45.0 30.0 weight 200.0 100.0 cow speed 30.0 20.0 weight 250.0 150.0 falcon speed 320.0 250.0 weight 1.0 0.8 """ return super().drop( labels=labels, axis=axis, index=index, columns=columns, level=level, inplace=inplace, errors=errors, )
[docs] @rewrite_axis_style_signature( "mapper", [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], ) def rename( self, mapper: Optional[Renamer] = None, *, index: Optional[Renamer] = None, columns: Optional[Renamer] = None, axis: Optional[Axis] = None, copy: bool = True, inplace: bool = False, level: Optional[Level] = None, errors: str = "ignore", ) -> Optional[DataFrame]: """ Alter axes labels. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don't throw an error. See the :ref:`user guide <basics.rename>` for more. Parameters ---------- mapper : dict-like or function Dict-like or function transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and ``columns``. index : dict-like or function Alternative to specifying axis (``mapper, axis=0`` is equivalent to ``index=mapper``). columns : dict-like or function Alternative to specifying axis (``mapper, axis=1`` is equivalent to ``columns=mapper``). axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. copy : bool, default True Also copy underlying data. inplace : bool, default False Whether to return a new DataFrame. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified level. errors : {'ignore', 'raise'}, default 'ignore' If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, or `columns` contains labels that are not present in the Index being transformed. If 'ignore', existing keys will be renamed and extra keys will be ignored. Returns ------- DataFrame or None DataFrame with the renamed axis labels or None if ``inplace=True``. Raises ------ KeyError If any of the labels is not found in the selected axis and "errors='raise'". See Also -------- DataFrame.rename_axis : Set the name of the axis. Examples -------- ``DataFrame.rename`` supports two calling conventions * ``(index=index_mapper, columns=columns_mapper, ...)`` * ``(mapper, axis={'index', 'columns'}, ...)`` We *highly* recommend using keyword arguments to clarify your intent. Rename columns using a mapping: >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df.rename(columns={"A": "a", "B": "c"}) a c 0 1 4 1 2 5 2 3 6 Rename index using a mapping: >>> df.rename(index={0: "x", 1: "y", 2: "z"}) A B x 1 4 y 2 5 z 3 6 Cast index labels to a different type: >>> df.index RangeIndex(start=0, stop=3, step=1) >>> df.rename(index=str).index Index(['0', '1', '2'], dtype='object') >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") Traceback (most recent call last): KeyError: ['C'] not found in axis Using axis-style parameters: >>> df.rename(str.lower, axis='columns') a b 0 1 4 1 2 5 2 3 6 >>> df.rename({1: 2, 2: 4}, axis='index') A B 0 1 4 2 2 5 4 3 6 """ return super().rename( mapper=mapper, index=index, columns=columns, axis=axis, copy=copy, inplace=inplace, level=level, errors=errors, )
[docs] @doc(NDFrame.fillna, **_shared_doc_kwargs) def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, ) -> Optional[DataFrame]: return super().fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit, downcast=downcast, )
[docs] def pop(self, item: Label) -> Series: """ Return item and drop from frame. Raise KeyError if not found. Parameters ---------- item : label Label of column to be popped. Returns ------- Series Examples -------- >>> df = pd.DataFrame([('falcon', 'bird', 389.0), ... ('parrot', 'bird', 24.0), ... ('lion', 'mammal', 80.5), ... ('monkey', 'mammal', np.nan)], ... columns=('name', 'class', 'max_speed')) >>> df name class max_speed 0 falcon bird 389.0 1 parrot bird 24.0 2 lion mammal 80.5 3 monkey mammal NaN >>> df.pop('class') 0 bird 1 bird 2 mammal 3 mammal Name: class, dtype: object >>> df name max_speed 0 falcon 389.0 1 parrot 24.0 2 lion 80.5 3 monkey NaN """ return super().pop(item=item)
[docs] @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method="pad", ): return super().replace( to_replace=to_replace, value=value, inplace=inplace, limit=limit, regex=regex, method=method, )
def _replace_columnwise( self, mapping: Dict[Label, Tuple[Any, Any]], inplace: bool, regex ): """ Dispatch to Series.replace column-wise. Parameters ---------- mapping : dict of the form {col: (target, value)} inplace : bool regex : bool or same types as `to_replace` in DataFrame.replace Returns ------- DataFrame or None """ # Operate column-wise res = self if inplace else self.copy() ax = self.columns for i in range(len(ax)): if ax[i] in mapping: ser = self.iloc[:, i] target, value = mapping[ax[i]] newobj = ser.replace(target, value, regex=regex) res.iloc[:, i] = newobj if inplace: return return res.__finalize__(self)
[docs] @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift( self, periods=1, freq=None, axis=0, fill_value=lib.no_default ) -> DataFrame: axis = self._get_axis_number(axis) ncols = len(self.columns) if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: # We will infer fill_value to match the closest column # Use a column that we know is valid for our column's dtype GH#38434 label = self.columns[0] if periods > 0: result = self.iloc[:, :-periods] for col in range(min(ncols, abs(periods))): # TODO(EA2D): doing this in a loop unnecessary with 2D EAs # Define filler inside loop so we get a copy filler = self.iloc[:, 0].shift(len(self)) result.insert(0, label, filler, allow_duplicates=True) else: result = self.iloc[:, -periods:] for col in range(min(ncols, abs(periods))): # Define filler inside loop so we get a copy filler = self.iloc[:, -1].shift(len(self)) result.insert( len(result.columns), label, filler, allow_duplicates=True ) result.columns = self.columns.copy() return result return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value )
[docs] def set_index( self, keys, drop=True, append=False, inplace=False, verify_integrity=False ): """ Set the DataFrame index using existing columns. Set the DataFrame index (row labels) using one or more existing columns or arrays (of the correct length). The index can replace the existing index or expand on it. Parameters ---------- keys : label or array-like or list of labels/arrays This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and instances of :class:`~collections.abc.Iterator`. drop : bool, default True Delete columns to be used as the new index. append : bool, default False Whether to append columns to existing index. inplace : bool, default False If True, modifies the DataFrame in place (do not create a new object). verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this method. Returns ------- DataFrame or None Changed row labels or None if ``inplace=True``. See Also -------- DataFrame.reset_index : Opposite of set_index. DataFrame.reindex : Change to new indices or expand indices. DataFrame.reindex_like : Change to same indices as other DataFrame. Examples -------- >>> df = pd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}) >>> df month year sale 0 1 2012 55 1 4 2014 40 2 7 2013 84 3 10 2014 31 Set the index to become the 'month' column: >>> df.set_index('month') year sale month 1 2012 55 4 2014 40 7 2013 84 10 2014 31 Create a MultiIndex using columns 'year' and 'month': >>> df.set_index(['year', 'month']) sale year month 2012 1 55 2014 4 40 2013 7 84 2014 10 31 Create a MultiIndex using an Index and a column: >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale year 1 2012 1 55 2 2014 4 40 3 2013 7 84 4 2014 10 31 Create a MultiIndex using two Series: >>> s = pd.Series([1, 2, 3, 4]) >>> df.set_index([s, s**2]) month year sale 1 1 1 2012 55 2 4 4 2014 40 3 9 7 2013 84 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] err_msg = ( 'The parameter "keys" may be a column key, one-dimensional ' "array, or a list containing only valid column keys and " "one-dimensional arrays." ) missing: List[Label] = [] for col in keys: if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): # arrays are fine as long as they are one-dimensional # iterators get converted to list below if getattr(col, "ndim", 1) != 1: raise ValueError(err_msg) else: # everything else gets tried as a key; see GH 24969 try: found = col in self.columns except TypeError as err: raise TypeError( f"{err_msg}. Received column of type {type(col)}" ) from err else: if not found: missing.append(col) if missing: raise KeyError(f"None of {missing} are in the columns") if inplace: frame = self else: frame = self.copy() arrays = [] names: List[Label] = [] if append: names = list(self.index.names) if isinstance(self.index, MultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: arrays.append(self.index) to_remove: List[Label] = [] for col in keys: if isinstance(col, MultiIndex): for n in range(col.nlevels): arrays.append(col._get_level_values(n)) names.extend(col.names) elif isinstance(col, (Index, Series)): # if Index then not MultiIndex (treated above) arrays.append(col) names.append(col.name) elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) elif isinstance(col, abc.Iterator): arrays.append(list(col)) names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) names.append(col) if drop: to_remove.append(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since # ensure_index_from_sequences would not raise for append=False. raise ValueError( f"Length mismatch: Expected {len(self)} rows, " f"received array of length {len(arrays[-1])}" ) index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index[index.duplicated()].unique() raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop for c in set(to_remove): del frame[c] # clear up memory usage index._cleanup() frame.index = index if not inplace: return frame
@overload # https://github.com/python/mypy/issues/6580 # Overloaded function signatures 1 and 2 overlap with incompatible return types def reset_index( # type: ignore[misc] self, level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., drop: bool = ..., inplace: Literal[False] = ..., col_level: Hashable = ..., col_fill: Label = ..., ) -> DataFrame: ... @overload def reset_index( self, level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., drop: bool = ..., inplace: Literal[True] = ..., col_level: Hashable = ..., col_fill: Label = ..., ) -> None: ...
[docs] def reset_index( self, level: Optional[Union[Hashable, Sequence[Hashable]]] = None, drop: bool = False, inplace: bool = False, col_level: Hashable = 0, col_fill: Label = "", ) -> Optional[DataFrame]: """ Reset the index, or a level of it. Reset the index of the DataFrame, and use the default one instead. If the DataFrame has a MultiIndex, this method can remove one or more levels. Parameters ---------- level : int, str, tuple, or list, default None Only remove the given levels from the index. Removes all levels by default. drop : bool, default False Do not try to insert index into dataframe columns. This resets the index to the default integer index. inplace : bool, default False Modify the DataFrame in place (do not create a new object). col_level : int or str, default 0 If the columns have multiple levels, determines which level the labels are inserted into. By default it is inserted into the first level. col_fill : object, default '' If the columns have multiple levels, determines how the other levels are named. If None then the index name is repeated. Returns ------- DataFrame or None DataFrame with the new index or None if ``inplace=True``. See Also -------- DataFrame.set_index : Opposite of reset_index. DataFrame.reindex : Change to new indices or expand indices. DataFrame.reindex_like : Change to same indices as other DataFrame. Examples -------- >>> df = pd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), ... ('mammal', np.nan)], ... index=['falcon', 'parrot', 'lion', 'monkey'], ... columns=('class', 'max_speed')) >>> df class max_speed falcon bird 389.0 parrot bird 24.0 lion mammal 80.5 monkey mammal NaN When we reset the index, the old index is added as a column, and a new sequential index is used: >>> df.reset_index() index class max_speed 0 falcon bird 389.0 1 parrot bird 24.0 2 lion mammal 80.5 3 monkey mammal NaN We can use the `drop` parameter to avoid the old index being added as a column: >>> df.reset_index(drop=True) class max_speed 0 bird 389.0 1 bird 24.0 2 mammal 80.5 3 mammal NaN You can also use `reset_index` with `MultiIndex`. >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), ... ('mammal', 'monkey')], ... names=['class', 'name']) >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), ... ('species', 'type')]) >>> df = pd.DataFrame([(389.0, 'fly'), ... ( 24.0, 'fly'), ... ( 80.5, 'run'), ... (np.nan, 'jump')], ... index=index, ... columns=columns) >>> df speed species max type class name bird falcon 389.0 fly parrot 24.0 fly mammal lion 80.5 run monkey NaN jump If the index has multiple levels, we can reset a subset of them: >>> df.reset_index(level='class') class speed species max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump If we are not dropping the index, by default, it is placed in the top level. We can place it in another level: >>> df.reset_index(level='class', col_level=1) speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump When the index is inserted under another level, we can specify under which one with the parameter `col_fill`: >>> df.reset_index(level='class', col_level=1, col_fill='species') species speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump If we specify a nonexistent level for `col_fill`, it is created: >>> df.reset_index(level='class', col_level=1, col_fill='genus') genus speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: new_obj = self.copy() new_index = ibase.default_index(len(new_obj)) if level is not None: if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] if len(level) < self.index.nlevels: new_index = self.index.droplevel(level) if not drop: to_insert: Iterable[Tuple[Any, Optional[Any]]] if isinstance(self.index, MultiIndex): names = [ (n if n is not None else f"level_{i}") for i, n in enumerate(self.index.names) ] to_insert = zip(self.index.levels, self.index.codes) else: default = "index" if "index" not in self else "level_0" names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): if not (level is None or i in level): continue name = names[i] if multi_col: col_name = list(name) if isinstance(name, tuple) else [name] if col_fill is None: if len(col_name) not in (1, self.columns.nlevels): raise ValueError( "col_fill=None is incompatible " f"with incomplete column name {name}" ) col_fill = col_name[0] lev_num = self.columns._get_level_number(col_level) name_lst = [col_fill] * lev_num + col_name missing = self.columns.nlevels - len(name_lst) name_lst += [col_fill] * missing name = tuple(name_lst) # to ndarray and maybe infer different dtype level_values = maybe_casted_values(lev, lab) new_obj.insert(0, name, level_values) new_obj.index = new_index if not inplace: return new_obj return None
# ---------------------------------------------------------------------- # Reindex-based selection methods
[docs] @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> DataFrame: result = self._constructor(self._mgr.isna(func=isna)) return result.__finalize__(self, method="isna")
[docs] @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isnull(self) -> DataFrame: return self.isna()
[docs] @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notna(self) -> DataFrame: return ~self.isna()
[docs] @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notnull(self) -> DataFrame: return ~self.isna()
[docs] def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ Remove missing values. See the :ref:`User Guide <missing_data>` for more on which values are considered missing, and how to work with missing data. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 Determine if rows or columns which contain missing values are removed. * 0, or 'index' : Drop rows which contain missing values. * 1, or 'columns' : Drop columns which contain missing value. .. versionchanged:: 1.0.0 Pass tuple or list to drop on multiple axes. Only a single axis is allowed. how : {'any', 'all'}, default 'any' Determine if row or column is removed from DataFrame, when we have at least one NA or all NA. * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. thresh : int, optional Require that many non-NA values. subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False If True, do operation inplace and return None. Returns ------- DataFrame or None DataFrame with NA entries dropped from it or None if ``inplace=True``. See Also -------- DataFrame.isna: Indicate missing values. DataFrame.notna : Indicate existing (non-missing) values. DataFrame.fillna : Replace missing values. Series.dropna : Drop missing values. Index.dropna : Drop missing indices. Examples -------- >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), ... pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT Drop the rows where at least one element is missing. >>> df.dropna() name toy born 1 Batman Batmobile 1940-04-25 Drop the columns where at least one element is missing. >>> df.dropna(axis='columns') name 0 Alfred 1 Batman 2 Catwoman Drop the rows where all elements are missing. >>> df.dropna(how='all') name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT Keep only the rows with at least 2 non-NA values. >>> df.dropna(thresh=2) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT Define in which columns to look for missing values. >>> df.dropna(subset=['name', 'toy']) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT Keep the DataFrame with valid entries in the same variable. >>> df.dropna(inplace=True) >>> df name toy born 1 Batman Batmobile 1940-04-25 """ inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(axis, (tuple, list)): # GH20987 raise TypeError("supplying multiple axes to axis is no longer supported.") axis = self._get_axis_number(axis) agg_axis = 1 - axis agg_obj = self if subset is not None: ax = self._get_axis(agg_axis) indices = ax.get_indexer_for(subset) check = indices == -1 if check.any(): raise KeyError(list(np.compress(check, subset))) agg_obj = self.take(indices, axis=agg_axis) count = agg_obj.count(axis=agg_axis) if thresh is not None: mask = count >= thresh elif how == "any": mask = count == len(agg_obj._get_axis(agg_axis)) elif how == "all": mask = count > 0 else: if how is not None: raise ValueError(f"invalid how option: {how}") else: raise TypeError("must specify how or thresh") result = self.loc(axis=axis)[mask] if inplace: self._update_inplace(result) else: return result
[docs] def drop_duplicates( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, ) -> Optional[DataFrame]: """ Return DataFrame with duplicate rows removed. Considering certain columns is optional. Indexes, including time indexes are ignored. Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' Determines which duplicates (if any) to keep. - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. .. versionadded:: 1.0.0 Returns ------- DataFrame or None DataFrame with duplicates removed or None if ``inplace=True``. See Also -------- DataFrame.value_counts: Count unique combinations of columns. Examples -------- Consider dataset containing ramen rating. >>> df = pd.DataFrame({ ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], ... 'rating': [4, 4, 3.5, 15, 5] ... }) >>> df brand style rating 0 Yum Yum cup 4.0 1 Yum Yum cup 4.0 2 Indomie cup 3.5 3 Indomie pack 15.0 4 Indomie pack 5.0 By default, it removes duplicate rows based on all columns. >>> df.drop_duplicates() brand style rating 0 Yum Yum cup 4.0 2 Indomie cup 3.5 3 Indomie pack 15.0 4 Indomie pack 5.0 To remove duplicates on specific column(s), use ``subset``. >>> df.drop_duplicates(subset=['brand']) brand style rating 0 Yum Yum cup 4.0 2 Indomie cup 3.5 To remove duplicates and keep last occurrences, use ``keep``. >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 """ if self.empty: return self.copy() inplace = validate_bool_kwarg(inplace, "inplace") ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") duplicated = self.duplicated(subset, keep=keep) result = self[-duplicated] if ignore_index: result.index = ibase.default_index(len(result)) if inplace: self._update_inplace(result) return None else: return result
[docs] def duplicated( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", ) -> Series: """ Return boolean Series denoting duplicate rows. Considering certain columns is optional. Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' Determines which duplicates (if any) to mark. - ``first`` : Mark duplicates as ``True`` except for the first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. Returns ------- Series Boolean series for each duplicated rows. See Also -------- Index.duplicated : Equivalent method on index. Series.duplicated : Equivalent method on Series. Series.drop_duplicates : Remove duplicate values from Series. DataFrame.drop_duplicates : Remove duplicate values from DataFrame. Examples -------- Consider dataset containing ramen rating. >>> df = pd.DataFrame({ ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], ... 'rating': [4, 4, 3.5, 15, 5] ... }) >>> df brand style rating 0 Yum Yum cup 4.0 1 Yum Yum cup 4.0 2 Indomie cup 3.5 3 Indomie pack 15.0 4 Indomie pack 5.0 By default, for each set of duplicated values, the first occurrence is set on False and all others on True. >>> df.duplicated() 0 False 1 True 2 False 3 False 4 False dtype: bool By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True. >>> df.duplicated(keep='last') 0 True 1 False 2 False 3 False 4 False dtype: bool By setting ``keep`` on False, all duplicates are True. >>> df.duplicated(keep=False) 0 True 1 True 2 False 3 False 4 False dtype: bool To find duplicates on specific column(s), use ``subset``. >>> df.duplicated(subset=['brand']) 0 False 1 True 2 False 3 True 4 True dtype: bool """ from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) def f(vals): labels, shape = algorithms.factorize( vals, size_hint=min(len(self), SIZE_HINT_LIMIT) ) return labels.astype("i8", copy=False), len(shape) if subset is None: subset = self.columns elif ( not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) and subset in self.columns ): subset = (subset,) # needed for mypy since can't narrow types using np.iterable subset = cast(Iterable, subset) # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a # key that doesn't exist. diff = Index(subset).difference(self.columns) if not diff.empty: raise KeyError(diff) vals = (col.values for name, col in self.items() if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) return result.__finalize__(self, method="duplicated")
# ---------------------------------------------------------------------- # Sorting # TODO: Just move the sort_values doc here.
[docs] @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) # error: Signature of "sort_values" incompatible with supertype "NDFrame" def sort_values( # type: ignore[override] self, by, axis=0, ascending=True, inplace=False, kind="quicksort", na_position="last", ignore_index=False, key: ValueKeyFunc = None, ): inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) if not isinstance(by, list): by = [by] if is_sequence(ascending) and len(by) != len(ascending): raise ValueError( f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: keys = [self._get_label_or_level_values(x, axis=axis) for x in by] # need to rewrap columns in Series to apply key function if key is not None: keys = [Series(k, name=name) for (k, name) in zip(keys, by)] indexer = lexsort_indexer( keys, orders=ascending, na_position=na_position, key=key ) indexer = ensure_platform_int(indexer) else: by = by[0] k = self._get_label_or_level_values(by, axis=axis) # need to rewrap column in Series to apply key function if key is not None: k = Series(k, name=by) if isinstance(ascending, (tuple, list)): ascending = ascending[0] indexer = nargsort( k, kind=kind, ascending=ascending, na_position=na_position, key=key ) new_data = self._mgr.take( indexer, axis=self._get_block_manager_axis(axis), verify=False ) if ignore_index: new_data.axes[1] = ibase.default_index(len(indexer)) result = self._constructor(new_data) if inplace: return self._update_inplace(result) else: return result.__finalize__(self, method="sort_values")
[docs] def sort_index( self, axis=0, level=None, ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True, inplace: bool = False, kind: str = "quicksort", na_position: str = "last", sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, ): """ Sort object by labels (along an axis). Returns a new DataFrame sorted by label if `inplace` argument is ``False``, otherwise updates the original DataFrame and returns None. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis along which to sort. The value 0 identifies the rows, and 1 identifies the columns. level : int or level name or list of ints or list of level names If not None, sort on values in specified index level(s). ascending : bool or list-like of bools, default True Sort ascending vs. descending. When the index is a MultiIndex the sort direction can be controlled for each level individually. inplace : bool, default False If True, perform operation in-place. kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more information. `mergesort` is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. Not implemented for MultiIndex. sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. .. versionadded:: 1.0.0 key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. For MultiIndex inputs, the key is applied *per level*. .. versionadded:: 1.1.0 Returns ------- DataFrame or None The original DataFrame sorted by the labels or None if ``inplace=True``. See Also -------- Series.sort_index : Sort Series by the index. DataFrame.sort_values : Sort DataFrame by the value. Series.sort_values : Sort Series by the value. Examples -------- >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], ... columns=['A']) >>> df.sort_index() A 1 4 29 2 100 1 150 5 234 3 By default, it sorts in ascending order, to sort in descending order, use ``ascending=False`` >>> df.sort_index(ascending=False) A 234 3 150 5 100 1 29 2 1 4 A key function can be specified which is applied to the index before sorting. For a ``MultiIndex`` this is applied to each level separately. >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) >>> df.sort_index(key=lambda x: x.str.lower()) a A 1 b 2 C 3 d 4 """ return super().sort_index( axis, level, ascending, inplace, kind, na_position, sort_remaining, ignore_index, key, )
[docs] def value_counts( self, subset: Optional[Sequence[Label]] = None, normalize: bool = False, sort: bool = True, ascending: bool = False, ): """ Return a Series containing counts of unique rows in the DataFrame. .. versionadded:: 1.1.0 Parameters ---------- subset : list-like, optional Columns to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True Sort by frequencies. ascending : bool, default False Sort in ascending order. Returns ------- Series See Also -------- Series.value_counts: Equivalent method on Series. Notes ----- The returned Series will have a MultiIndex with one level per input column. By default, rows that contain any NA values are omitted from the result. By default, the resulting Series will be in descending order so that the first element is the most frequently-occurring row. Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], ... 'num_wings': [2, 0, 0, 0]}, ... index=['falcon', 'dog', 'cat', 'ant']) >>> df num_legs num_wings falcon 2 2 dog 4 0 cat 4 0 ant 6 0 >>> df.value_counts() num_legs num_wings 4 0 2 2 2 1 6 0 1 dtype: int64 >>> df.value_counts(sort=False) num_legs num_wings 2 2 1 4 0 2 6 0 1 dtype: int64 >>> df.value_counts(ascending=True) num_legs num_wings 2 2 1 6 0 1 4 0 2 dtype: int64 >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 2 2 0.25 6 0 0.25 dtype: float64 """ if subset is None: subset = self.columns.tolist() counts = self.groupby(subset).grouper.size() if sort: counts = counts.sort_values(ascending=ascending) if normalize: counts /= counts.sum() # Force MultiIndex for single column if len(subset) == 1: counts.index = MultiIndex.from_arrays( [counts.index], names=[counts.index.name] ) return counts
[docs] def nlargest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. Return the first `n` rows with the largest values in `columns`, in descending order. The columns that are not specified are returned as well, but not used for ordering. This method is equivalent to ``df.sort_values(columns, ascending=False).head(n)``, but more performant. Parameters ---------- n : int Number of rows to return. columns : label or list of labels Column label(s) to order by. keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: - `first` : prioritize the first occurrence(s) - `last` : prioritize the last occurrence(s) - ``all`` : do not drop any duplicates, even it means selecting more than `n` items. .. versionadded:: 0.24.0 Returns ------- DataFrame The first `n` rows ordered by the given columns in descending order. See Also -------- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in ascending order. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. Notes ----- This function cannot be used with all column types. For example, when specifying columns with `object` or `category` dtypes, ``TypeError`` is raised. Examples -------- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, ... 434000, 434000, 337000, 11300, ... 11300, 11300], ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, ... 17036, 182, 38, 311], ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", ... "IS", "NR", "TV", "AI"]}, ... index=["Italy", "France", "Malta", ... "Maldives", "Brunei", "Iceland", ... "Nauru", "Tuvalu", "Anguilla"]) >>> df population GDP alpha-2 Italy 59000000 1937894 IT France 65000000 2583560 FR Malta 434000 12011 MT Maldives 434000 4520 MV Brunei 434000 12128 BN Iceland 337000 17036 IS Nauru 11300 182 NR Tuvalu 11300 38 TV Anguilla 11300 311 AI In the following example, we will use ``nlargest`` to select the three rows having the largest values in column "population". >>> df.nlargest(3, 'population') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT Malta 434000 12011 MT When using ``keep='last'``, ties are resolved in reverse order: >>> df.nlargest(3, 'population', keep='last') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN When using ``keep='all'``, all duplicate items are maintained: >>> df.nlargest(3, 'population', keep='all') population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT Malta 434000 12011 MT Maldives 434000 4520 MV Brunei 434000 12128 BN To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. >>> df.nlargest(3, ['population', 'GDP']) population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
[docs] def nsmallest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. Return the first `n` rows with the smallest values in `columns`, in ascending order. The columns that are not specified are returned as well, but not used for ordering. This method is equivalent to ``df.sort_values(columns, ascending=True).head(n)``, but more performant. Parameters ---------- n : int Number of items to retrieve. columns : list or str Column name or names to order by. keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. - ``all`` : do not drop any duplicates, even it means selecting more than `n` items. .. versionadded:: 0.24.0 Returns ------- DataFrame See Also -------- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in descending order. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. Examples -------- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, ... 434000, 434000, 337000, 337000, ... 11300, 11300], ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, ... 17036, 182, 38, 311], ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", ... "IS", "NR", "TV", "AI"]}, ... index=["Italy", "France", "Malta", ... "Maldives", "Brunei", "Iceland", ... "Nauru", "Tuvalu", "Anguilla"]) >>> df population GDP alpha-2 Italy 59000000 1937894 IT France 65000000 2583560 FR Malta 434000 12011 MT Maldives 434000 4520 MV Brunei 434000 12128 BN Iceland 337000 17036 IS Nauru 337000 182 NR Tuvalu 11300 38 TV Anguilla 11300 311 AI In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "population". >>> df.nsmallest(3, 'population') population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI Iceland 337000 17036 IS When using ``keep='last'``, ties are resolved in reverse order: >>> df.nsmallest(3, 'population', keep='last') population GDP alpha-2 Anguilla 11300 311 AI Tuvalu 11300 38 TV Nauru 337000 182 NR When using ``keep='all'``, all duplicate items are maintained: >>> df.nsmallest(3, 'population', keep='all') population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI Iceland 337000 17036 IS Nauru 337000 182 NR To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. >>> df.nsmallest(3, ['population', 'GDP']) population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI Nauru 337000 182 NR """ return algorithms.SelectNFrame( self, n=n, keep=keep, columns=columns ).nsmallest()
[docs] def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame: """ Swap levels i and j in a MultiIndex on a particular axis. Parameters ---------- i, j : int or str Levels of the indices to be swapped. Can pass level name as string. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to swap levels on. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. Returns ------- DataFrame """ result = self.copy() axis = self._get_axis_number(axis) if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover raise TypeError("Can only swap levels on a hierarchical axis.") if axis == 0: assert isinstance(result.index, MultiIndex) result.index = result.index.swaplevel(i, j) else: assert isinstance(result.columns, MultiIndex) result.columns = result.columns.swaplevel(i, j) return result
[docs] def reorder_levels(self, order, axis=0) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. Parameters ---------- order : list of int or list of str List representing new level order. Reference level by number (position) or by key (label). axis : {0 or 'index', 1 or 'columns'}, default 0 Where to reorder levels. Returns ------- DataFrame """ axis = self._get_axis_number(axis) if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() if axis == 0: assert isinstance(result.index, MultiIndex) result.index = result.index.reorder_levels(order) else: assert isinstance(result.columns, MultiIndex) result.columns = result.columns.reorder_levels(order) return result
# ---------------------------------------------------------------------- # Arithmetic Methods def _cmp_method(self, other, op): axis = 1 # only relevant for Series other case self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) # See GH#4537 for discussion of scalar op behavior new_data = self._dispatch_frame_op(other, op, axis=axis) return self._construct_result(new_data) def _arith_method(self, other, op): if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): return ops.frame_arith_method_with_reindex(self, other, op) axis = 1 # only relevant for Series other case self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) new_data = self._dispatch_frame_op(other, op, axis=axis) return self._construct_result(new_data) _logical_method = _arith_method def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): """ Evaluate the frame operation func(left, right) by evaluating column-by-column, dispatching to the Series implementation. Parameters ---------- right : scalar, Series, or DataFrame func : arithmetic or comparison operator axis : {None, 0, 1} Returns ------- DataFrame """ # Get the appropriate array-op to apply to each column/block's values. array_op = ops.get_array_op(func) right = lib.item_from_zerodim(right) if not is_list_like(right): # i.e. scalar, faster than checking np.ndim(right) == 0 bm = self._mgr.apply(array_op, right=right) return type(self)(bm) elif isinstance(right, DataFrame): assert self.index.equals(right.index) assert self.columns.equals(right.columns) # TODO: The previous assertion `assert right._indexed_same(self)` # fails in cases with empty columns reached via # _frame_arith_method_with_reindex bm = self._mgr.operate_blockwise(right._mgr, array_op) return type(self)(bm) elif isinstance(right, Series) and axis == 1: # axis=1 means we want to operate row-by-row assert right.index.equals(self.columns) right = right._values # maybe_align_as_frame ensures we do not have an ndarray here assert not isinstance(right, np.ndarray) arrays = [ array_op(_left, _right) for _left, _right in zip(self._iter_column_arrays(), right) ] elif isinstance(right, Series): assert right.index.equals(self.index) # Handle other cases later right = right._values arrays = [array_op(left, right) for left in self._iter_column_arrays()] else: # Remaining cases have less-obvious dispatch rules raise NotImplementedError(right) return type(self)._from_arrays( arrays, self.columns, self.index, verify_integrity=False ) def _combine_frame(self, other: DataFrame, func, fill_value=None): # at this point we have `self._indexed_same(other)` if fill_value is None: # since _arith_op may be called in a loop, avoid function call # overhead if possible by doing this check once _arith_op = func else: def _arith_op(left, right): # for the mixed_type case where we iterate over columns, # _arith_op(left, right) is equivalent to # left._binop(right, func, fill_value=fill_value) left, right = ops.fill_binop(left, right, fill_value) return func(left, right) new_data = self._dispatch_frame_op(other, _arith_op) return new_data def _construct_result(self, result) -> DataFrame: """ Wrap the result of an arithmetic, comparison, or logical operation. Parameters ---------- result : DataFrame Returns ------- DataFrame """ out = self._constructor(result, copy=False) # Pin columns instead of passing to constructor for compat with # non-unique columns case out.columns = self.columns out.index = self.index return out def __divmod__(self, other) -> Tuple[DataFrame, DataFrame]: # Naive implementation, room for optimization div = self // other mod = self - div * other return div, mod def __rdivmod__(self, other) -> Tuple[DataFrame, DataFrame]: # Naive implementation, room for optimization div = other // self mod = other - div * self return div, mod # ---------------------------------------------------------------------- # Combination-Related
[docs] @doc( _shared_docs["compare"], """ Returns ------- DataFrame DataFrame that shows the differences stacked side by side. The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. Raises ------ ValueError When the two DataFrames don't have identical labels or shape. See Also -------- Series.compare : Compare with another Series and show differences. DataFrame.equals : Test whether two objects contain the same elements. Notes ----- Matching NaNs will not appear as a difference. Can only compare identically-labeled (i.e. same shape, identical row and column labels) DataFrames Examples -------- >>> df = pd.DataFrame( ... {{ ... "col1": ["a", "a", "b", "b", "a"], ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] ... }}, ... columns=["col1", "col2", "col3"], ... ) >>> df col1 col2 col3 0 a 1.0 1.0 1 a 2.0 2.0 2 b 3.0 3.0 3 b NaN 4.0 4 a 5.0 5.0 >>> df2 = df.copy() >>> df2.loc[0, 'col1'] = 'c' >>> df2.loc[2, 'col3'] = 4.0 >>> df2 col1 col2 col3 0 c 1.0 1.0 1 a 2.0 2.0 2 b 3.0 4.0 3 b NaN 4.0 4 a 5.0 5.0 Align the differences on columns >>> df.compare(df2) col1 col3 self other self other 0 a c NaN NaN 2 NaN NaN 3.0 4.0 Stack the differences on rows >>> df.compare(df2, align_axis=0) col1 col3 0 self a NaN other c NaN 2 self NaN 3.0 other NaN 4.0 Keep the equal values >>> df.compare(df2, keep_equal=True) col1 col3 self other self other 0 a c 1.0 1.0 2 b b 3.0 4.0 Keep all original rows and columns >>> df.compare(df2, keep_shape=True) col1 col2 col3 self other self other self other 0 a c NaN NaN NaN NaN 1 NaN NaN NaN NaN NaN NaN 2 NaN NaN NaN NaN 3.0 4.0 3 NaN NaN NaN NaN NaN NaN 4 NaN NaN NaN NaN NaN NaN Keep all original rows and columns and also all original values >>> df.compare(df2, keep_shape=True, keep_equal=True) col1 col2 col3 self other self other self other 0 a c 1.0 1.0 1.0 1.0 1 a a 2.0 2.0 2.0 2.0 2 b b 3.0 3.0 3.0 4.0 3 b b NaN NaN 4.0 4.0 4 a a 5.0 5.0 5.0 5.0 """, klass=_shared_doc_kwargs["klass"], ) def compare( self, other: DataFrame, align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, )
[docs] def combine( self, other: DataFrame, func, fill_value=None, overwrite=True ) -> DataFrame: """ Perform column-wise combine with another DataFrame. Combines a DataFrame with `other` DataFrame using `func` to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two. Parameters ---------- other : DataFrame The DataFrame to merge column-wise. func : function Function that takes two series as inputs and return a Series or a scalar. Used to merge the two dataframes column by columns. fill_value : scalar value, default None The value to fill NaNs with prior to passing any column to the merge func. overwrite : bool, default True If True, columns in `self` that do not exist in `other` will be overwritten with NaNs. Returns ------- DataFrame Combination of the provided DataFrames. See Also -------- DataFrame.combine_first : Combine two DataFrame objects and default to non-null values in frame calling the method. Examples -------- Combine using a simple function that chooses the smaller column. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 >>> df1.combine(df2, take_smaller) A B 0 0 3 1 0 3 Example using a true element-wise combine function. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine(df2, np.minimum) A B 0 1 2 1 0 3 Using `fill_value` fills Nones prior to passing the column to the merge function. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 1 0 4.0 However, if the same element in both dataframes is None, that None is preserved >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 1 0 3.0 Example that demonstrates the use of `overwrite` and behavior when the axis differ between the dataframes. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) >>> df1.combine(df2, take_smaller) A B C 0 NaN NaN NaN 1 NaN 3.0 -10.0 2 NaN 3.0 1.0 >>> df1.combine(df2, take_smaller, overwrite=False) A B C 0 0.0 NaN NaN 1 0.0 3.0 -10.0 2 NaN 3.0 1.0 Demonstrating the preference of the passed in dataframe. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) >>> df2.combine(df1, take_smaller) A B C 0 0.0 NaN NaN 1 0.0 3.0 NaN 2 NaN 3.0 NaN >>> df2.combine(df1, take_smaller, overwrite=False) A B C 0 0.0 NaN NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare this, other = self.align(other, copy=False) new_index = this.index if other.empty and len(new_index) == len(self.index): return self.copy() if self.empty and len(other) == other_idxlen: return other.copy() # sorts if possible new_columns = this.columns.union(other.columns) do_fill = fill_value is not None result = {} for col in new_columns: series = this[col] otherSeries = other[col] this_dtype = series.dtype other_dtype = otherSeries.dtype this_mask = isna(series) other_mask = isna(otherSeries) # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): result[col] = this[col].copy() continue if do_fill: series = series.copy() otherSeries = otherSeries.copy() series[this_mask] = fill_value otherSeries[other_mask] = fill_value if col not in self.columns: # If self DataFrame does not have col in other DataFrame, # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype try: series = series.astype(new_dtype, copy=False) except ValueError: # e.g. new_dtype is integer types pass else: # if we have different dtypes, possibly promote new_dtype = find_common_type([this_dtype, other_dtype]) if not is_dtype_equal(this_dtype, new_dtype): series = series.astype(new_dtype) if not is_dtype_equal(other_dtype, new_dtype): otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) arr = maybe_downcast_to_dtype(arr, new_dtype) result[col] = arr # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns)
[docs] def combine_first(self, other: DataFrame) -> DataFrame: """ Update null elements with value in the same location in `other`. Combine two DataFrame objects by filling null values in one DataFrame with non-null values from other DataFrame. The row and column indexes of the resulting DataFrame will be the union of the two. Parameters ---------- other : DataFrame Provided DataFrame to use to fill null values. Returns ------- DataFrame See Also -------- DataFrame.combine : Perform series-wise operation on two DataFrames using a given function. Examples -------- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) A B 0 1.0 3.0 1 0.0 4.0 Null values still persist if the location of that null value does not exist in `other` >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) A B C 0 NaN 4.0 NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0 """ import pandas.core.computation.expressions as expressions def combiner(x, y): mask = extract_array(isna(x)) x_values = extract_array(x, extract_numpy=True) y_values = extract_array(y, extract_numpy=True) # If the column y in other DataFrame is not in first DataFrame, # just return y_values. if y.name not in self.columns: return y_values return expressions.where(mask, y_values, x_values) return self.combine(other, combiner, overwrite=False)
[docs] def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" ) -> None: """ Modify in place using non-NA values from another DataFrame. Aligns on indices. There is no return value. Parameters ---------- other : DataFrame, or object coercible into a DataFrame Should have at least one matching index/column label with the original DataFrame. If a Series is passed, its name attribute must be set, and that will be used as the column name to align with the original DataFrame. join : {'left'}, default 'left' Only left join is implemented, keeping the index and columns of the original object. overwrite : bool, default True How to handle non-NA values for overlapping keys: * True: overwrite original DataFrame's values with values from `other`. * False: only update values that are NA in the original DataFrame. filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. errors : {'raise', 'ignore'}, default 'ignore' If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. .. versionchanged:: 0.24.0 Changed from `raise_conflict=False|True` to `errors='ignore'|'raise'`. Returns ------- None : method directly changes calling object Raises ------ ValueError * When `errors='raise'` and there's overlapping non-NA data. * When `errors` is not either `'ignore'` or `'raise'` NotImplementedError * If `join != 'left'` See Also -------- dict.update : Similar method for dictionaries. DataFrame.merge : For column(s)-on-column(s) operations. Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600]}) >>> new_df = pd.DataFrame({'B': [4, 5, 6], ... 'C': [7, 8, 9]}) >>> df.update(new_df) >>> df A B 0 1 4 1 2 5 2 3 6 The DataFrame's length does not increase as a result of the update, only values at matching index/column labels are updated. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) >>> df.update(new_df) >>> df A B 0 a d 1 b e 2 c f For Series, its name attribute must be set. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) >>> df.update(new_column) >>> df A B 0 a d 1 b y 2 c e >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) >>> df.update(new_df) >>> df A B 0 a x 1 b d 2 c e If `other` contains NaNs the corresponding values are not updated in the original dataframe. >>> df = pd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600]}) >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) >>> df.update(new_df) >>> df A B 0 1 4.0 1 2 500.0 2 3 6.0 """ import pandas.core.computation.expressions as expressions # TODO: Support other joins if join != "left": # pragma: no cover raise NotImplementedError("Only left join is supported") if errors not in ["ignore", "raise"]: raise ValueError("The parameter errors must be either 'ignore' or 'raise'") if not isinstance(other, DataFrame): other = DataFrame(other) other = other.reindex_like(self) for col in self.columns: this = self[col]._values that = other[col]._values if filter_func is not None: with np.errstate(all="ignore"): mask = ~filter_func(this) | isna(that) else: if errors == "raise": mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): raise ValueError("Data overlaps.") if overwrite: mask = isna(that) else: mask = notna(this) # don't overwrite columns unnecessarily if mask.all(): continue self[col] = expressions.where(mask, this, that)
# ---------------------------------------------------------------------- # Data reshaping
[docs] @Appender( """ Examples -------- >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) >>> df Animal Max Speed 0 Falcon 380.0 1 Falcon 370.0 2 Parrot 24.0 3 Parrot 26.0 >>> df.groupby(['Animal']).mean() Max Speed Animal Falcon 375.0 Parrot 25.0 **Hierarchical Indexes** We can groupby different levels of a hierarchical index using the `level` parameter: >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], ... ['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, ... index=index) >>> df Max Speed Animal Type Falcon Captive 390.0 Wild 350.0 Parrot Captive 30.0 Wild 20.0 >>> df.groupby(level=0).mean() Max Speed Animal Falcon 370.0 Parrot 25.0 >>> df.groupby(level="Type").mean() Max Speed Type Captive 210.0 Wild 185.0 We can also choose to include NA in group keys or not by setting `dropna` parameter, the default setting is `True`: >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) >>> df.groupby(by=["b"]).sum() a c b 1.0 2 3 2.0 2 5 >>> df.groupby(by=["b"], dropna=False).sum() a c b 1.0 2 3 2.0 2 5 NaN 1 4 >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) >>> df.groupby(by="a").sum() b c a a 13.0 13.0 b 12.3 123.0 >>> df.groupby(by="a", dropna=False).sum() b c a a 13.0 13.0 b 12.3 123.0 NaN 12.3 33.0 """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) def groupby( self, by=None, axis=0, level=None, as_index: bool = True, sort: bool = True, group_keys: bool = True, squeeze: bool = no_default, observed: bool = False, dropna: bool = True, ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy if squeeze is not no_default: warnings.warn( ( "The `squeeze` parameter is deprecated and " "will be removed in a future version." ), FutureWarning, stacklevel=2, ) else: squeeze = False if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) return DataFrameGroupBy( obj=self, keys=by, axis=axis, level=level, as_index=as_index, sort=sort, group_keys=group_keys, squeeze=squeeze, observed=observed, dropna=dropna, )
_shared_docs[ "pivot" ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses unique values from specified `index` / `columns` to form axes of the resulting DataFrame. This function does not support data aggregation, multiple values will result in a MultiIndex in the columns. See the :ref:`User Guide <reshaping>` for more on reshaping. Parameters ----------%s index : str or object or a list of str, optional Column to use to make new frame's index. If None, uses existing index. .. versionchanged:: 1.1.0 Also accept list of index names. columns : str or object or a list of str Column to use to make new frame's columns. .. versionchanged:: 1.1.0 Also accept list of columns names. values : str, object or a list of the previous, optional Column(s) to use for populating new frame's values. If not specified, all remaining columns will be used and the result will have hierarchically indexed columns. Returns ------- DataFrame Returns reshaped DataFrame. Raises ------ ValueError: When there are any `index`, `columns` combinations with multiple values. `DataFrame.pivot_table` when you need to aggregate. See Also -------- DataFrame.pivot_table : Generalization of pivot that can handle duplicate values for one index/column pair. DataFrame.unstack : Pivot based on the index values instead of a column. wide_to_long : Wide panel to long format. Less flexible but more user-friendly than melt. Notes ----- For finer-tuned control, see hierarchical indexing documentation along with the related stack/unstack methods. Examples -------- >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', ... 'two'], ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], ... 'baz': [1, 2, 3, 4, 5, 6], ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) >>> df foo bar baz zoo 0 one A 1 x 1 one B 2 y 2 one C 3 z 3 two A 4 q 4 two B 5 w 5 two C 6 t >>> df.pivot(index='foo', columns='bar', values='baz') bar A B C foo one 1 2 3 two 4 5 6 >>> df.pivot(index='foo', columns='bar')['baz'] bar A B C foo one 1 2 3 two 4 5 6 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) baz zoo bar A B C A B C foo one 1 2 3 x y z two 4 5 6 q w t You could also assign a list of column names or a list of index names. >>> df = pd.DataFrame({ ... "lev1": [1, 1, 1, 2, 2, 2], ... "lev2": [1, 1, 2, 1, 1, 2], ... "lev3": [1, 2, 1, 2, 1, 2], ... "lev4": [1, 2, 3, 4, 5, 6], ... "values": [0, 1, 2, 3, 4, 5]}) >>> df lev1 lev2 lev3 lev4 values 0 1 1 1 1 0 1 1 1 2 2 1 2 1 2 1 3 2 3 2 1 2 4 3 4 2 1 1 5 4 5 2 2 2 6 5 >>> df.pivot(index="lev1", columns=["lev2", "lev3"],values="values") lev2 1 2 lev3 1 2 1 2 lev1 1 0.0 1.0 2.0 NaN 2 4.0 3.0 NaN 5.0 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values") lev3 1 2 lev1 lev2 1 1 0.0 1.0 2 2.0 NaN 2 1 4.0 3.0 2 NaN 5.0 A ValueError is raised if there are any duplicates. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], ... "bar": ['A', 'A', 'B', 'C'], ... "baz": [1, 2, 3, 4]}) >>> df foo bar baz 0 one A 1 1 one A 2 2 two B 3 3 two C 4 Notice that the first two rows are the same for our `index` and `columns` arguments. >>> df.pivot(index='foo', columns='bar', values='baz') Traceback (most recent call last): ... ValueError: Index contains duplicate entries, cannot reshape """
[docs] @Substitution("") @Appender(_shared_docs["pivot"]) def pivot(self, index=None, columns=None, values=None) -> DataFrame: from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values)
_shared_docs[ "pivot_table" ] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame. Parameters ----------%s values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function, list of functions, dict, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) If dict is passed, the key is column to aggregate and value is function or list of functions. fill_value : scalar, default None Value to replace missing values with (in the resulting pivot table, after aggregation). margins : bool, default False Add all row / columns (e.g. for subtotal / grand totals). dropna : bool, default True Do not include columns whose entries are all NaN. margins_name : str, default 'All' Name of the row / column that will contain the totals when margins is True. observed : bool, default False This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. .. versionchanged:: 0.25.0 Returns ------- DataFrame An Excel style pivot table. See Also -------- DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. DataFrame.melt: Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. wide_to_long : Wide panel to long format. Less flexible but more user-friendly than melt. Examples -------- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", ... "bar", "bar", "bar", "bar"], ... "B": ["one", "one", "one", "two", "two", ... "one", "one", "two", "two"], ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) >>> df A B C D E 0 foo one small 1 2 1 foo one large 2 4 2 foo one large 2 5 3 foo two small 3 5 4 foo two small 3 6 5 bar one large 4 6 6 bar one small 5 8 7 bar two small 6 9 8 bar two large 7 9 This first example aggregates values by taking the sum. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table C large small A B bar one 4.0 5.0 two 7.0 6.0 foo one 4.0 1.0 two NaN 6.0 We can also fill missing values using the `fill_value` parameter. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum, fill_value=0) >>> table C large small A B bar one 4 5 two 7 6 foo one 4 1 two 0 6 The next example aggregates by taking the mean across multiple columns. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 'E': np.mean}) >>> table D E A C bar large 5.500000 7.500000 small 5.500000 8.500000 foo large 2.000000 4.500000 small 2.333333 4.333333 We can also calculate multiple types of aggregations for any given value column. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 'E': [min, max, np.mean]}) >>> table D E mean max mean min A C bar large 5.500000 9.0 7.500000 6.0 small 5.500000 9.0 8.500000 8.0 foo large 2.000000 5.0 4.500000 4.0 small 2.333333 6.0 4.333333 2.0 """
[docs] @Substitution("") @Appender(_shared_docs["pivot_table"]) def pivot_table( self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=False, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table return pivot_table( self, values=values, index=index, columns=columns, aggfunc=aggfunc, fill_value=fill_value, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, )
[docs] def stack(self, level=-1, dropna=True): """ Stack the prescribed level(s) from columns to index. Return a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame. The new inner-most levels are created by pivoting the columns of the current dataframe: - if the columns have a single level, the output is a Series; - if the columns have multiple levels, the new index level(s) is (are) taken from the prescribed level(s) and the output is a DataFrame. Parameters ---------- level : int, str, list, default -1 Level(s) to stack from the column axis onto the index axis, defined as one index or label, or a list of indices or labels. dropna : bool, default True Whether to drop rows in the resulting Frame/Series with missing values. Stacking a column level onto the index axis can create combinations of index and column values that are missing from the original dataframe. See Examples section. Returns ------- DataFrame or Series Stacked dataframe or series. See Also -------- DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis. DataFrame.pivot : Reshape dataframe from long format to wide format. DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame. Notes ----- The function is named by analogy with a collection of books being reorganized from being side by side on a horizontal position (the columns of the dataframe) to being stacked vertically on top of each other (in the index of the dataframe). Examples -------- **Single level columns** >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], ... index=['cat', 'dog'], ... columns=['weight', 'height']) Stacking a dataframe with a single level column axis returns a Series: >>> df_single_level_cols weight height cat 0 1 dog 2 3 >>> df_single_level_cols.stack() cat weight 0 height 1 dog weight 2 height 3 dtype: int64 **Multi level columns: simple case** >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), ... ('weight', 'pounds')]) >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], ... index=['cat', 'dog'], ... columns=multicol1) Stacking a dataframe with a multi-level column axis: >>> df_multi_level_cols1 weight kg pounds cat 1 2 dog 2 4 >>> df_multi_level_cols1.stack() weight cat kg 1 pounds 2 dog kg 2 pounds 4 **Missing values** >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), ... ('height', 'm')]) >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], ... index=['cat', 'dog'], ... columns=multicol2) It is common to have missing values when stacking a dataframe with multi-level columns, as the stacked dataframe typically has more values than the original dataframe. Missing values are filled with NaNs: >>> df_multi_level_cols2 weight height kg m cat 1.0 2.0 dog 3.0 4.0 >>> df_multi_level_cols2.stack() height weight cat kg NaN 1.0 m 2.0 NaN dog kg NaN 3.0 m 4.0 NaN **Prescribing the level(s) to be stacked** The first parameter controls which level or levels are stacked: >>> df_multi_level_cols2.stack(0) kg m cat height NaN 2.0 weight 1.0 NaN dog height NaN 4.0 weight 3.0 NaN >>> df_multi_level_cols2.stack([0, 1]) cat height m 2.0 weight kg 1.0 dog height m 4.0 weight kg 3.0 dtype: float64 **Dropping missing values** >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], ... index=['cat', 'dog'], ... columns=multicol2) Note that rows where all values are missing are dropped by default but this behaviour can be controlled via the dropna keyword parameter: >>> df_multi_level_cols3 weight height kg m cat NaN 1.0 dog 2.0 3.0 >>> df_multi_level_cols3.stack(dropna=False) height weight cat kg NaN NaN m 1.0 NaN dog kg NaN 2.0 m 3.0 NaN >>> df_multi_level_cols3.stack(dropna=True) height weight cat m 1.0 NaN dog kg NaN 2.0 m 3.0 NaN """ from pandas.core.reshape.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): result = stack_multiple(self, level, dropna=dropna) else: result = stack(self, level, dropna=dropna) return result.__finalize__(self, method="stack")
[docs] def explode( self, column: Union[str, Tuple], ignore_index: bool = False ) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. .. versionadded:: 0.25.0 Parameters ---------- column : str or tuple Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. .. versionadded:: 1.1.0 Returns ------- DataFrame Exploded lists to rows of the subset columns; index will be duplicated for these rows. Raises ------ ValueError : if columns of the frame are not unique. See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) index labels. DataFrame.melt : Unpivot a DataFrame from wide format to long format. Series.explode : Explode a DataFrame from list-like columns to long format. Notes ----- This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will be object. Scalars will be returned unchanged, and empty list-likes will result in a np.nan for that row. In addition, the ordering of rows in the output will be non-deterministic when exploding sets. Examples -------- >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) >>> df A B 0 [1, 2, 3] 1 1 foo 1 2 [] 1 3 [3, 4] 1 >>> df.explode('A') A B 0 1 1 0 2 1 0 3 1 1 foo 1 2 NaN 1 3 3 1 3 4 1 """ if not (is_scalar(column) or isinstance(column, tuple)): raise ValueError("column must be a scalar") if not self.columns.is_unique: raise ValueError("columns must be unique") df = self.reset_index(drop=True) result = df[column].explode() result = df.drop([column], axis=1).join(result) if ignore_index: result.index = ibase.default_index(len(result)) else: result.index = self.index.take(result.index) result = result.reindex(columns=self.columns, copy=False) return result
[docs] def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels. Returns a DataFrame having a new level of column labels whose inner-most level consists of the pivoted index labels. If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). Parameters ---------- level : int, str, or list of these, default -1 (last level) Level(s) of index to unstack, can pass level name. fill_value : int, str or dict Replace NaN with this value if the unstack produces missing values. Returns ------- Series or DataFrame See Also -------- DataFrame.pivot : Pivot a table based on column values. DataFrame.stack : Pivot a level of the column labels (inverse operation from `unstack`). Examples -------- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ... ('two', 'a'), ('two', 'b')]) >>> s = pd.Series(np.arange(1.0, 5.0), index=index) >>> s one a 1.0 b 2.0 two a 3.0 b 4.0 dtype: float64 >>> s.unstack(level=-1) a b one 1.0 2.0 two 3.0 4.0 >>> s.unstack(level=0) one two a 1.0 3.0 b 2.0 4.0 >>> df = s.unstack(level=0) >>> df.unstack() one a 1.0 b 2.0 two a 3.0 b 4.0 dtype: float64 """ from pandas.core.reshape.reshape import unstack result = unstack(self, level, fill_value) return result.__finalize__(self, method="unstack")
[docs] @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) def melt( self, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, ignore_index=True, ) -> DataFrame: return melt( self, id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name, col_level=col_level, ignore_index=ignore_index, )
# ---------------------------------------------------------------------- # Time series-related
[docs] @doc( Series.diff, klass="Dataframe", extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " "Take difference over rows (0) or columns (1).\n", other_klass="Series", examples=dedent( """ Difference with previous row >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], ... 'b': [1, 1, 2, 3, 5, 8], ... 'c': [1, 4, 9, 16, 25, 36]}) >>> df a b c 0 1 1 1 1 2 1 4 2 3 2 9 3 4 3 16 4 5 5 25 5 6 8 36 >>> df.diff() a b c 0 NaN NaN NaN 1 1.0 0.0 3.0 2 1.0 1.0 5.0 3 1.0 1.0 7.0 4 1.0 2.0 9.0 5 1.0 3.0 11.0 Difference with previous column >>> df.diff(axis=1) a b c 0 NaN 0 0 1 NaN -1 3 2 NaN -1 7 3 NaN -1 13 4 NaN 0 20 5 NaN 2 28 Difference with 3rd previous row >>> df.diff(periods=3) a b c 0 NaN NaN NaN 1 NaN NaN NaN 2 NaN NaN NaN 3 3.0 2.0 15.0 4 3.0 4.0 21.0 5 3.0 6.0 27.0 Difference with following row >>> df.diff(periods=-1) a b c 0 -1.0 0.0 -3.0 1 -1.0 -1.0 -5.0 2 -1.0 -1.0 -7.0 3 -1.0 -2.0 -9.0 4 -1.0 -3.0 -11.0 5 NaN NaN NaN Overflow in input dtype >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) >>> df.diff() a 0 NaN 1 255.0""" ), ) def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: if not isinstance(periods, int): if not (is_float(periods) and periods.is_integer()): raise ValueError("periods must be an integer") periods = int(periods) bm_axis = self._get_block_manager_axis(axis) if bm_axis == 0 and periods != 0: return self - self.shift(periods, axis=axis) new_data = self._mgr.diff(n=periods, axis=bm_axis) return self._constructor(new_data).__finalize__(self, "diff")
# ---------------------------------------------------------------------- # Function application def _gotitem( self, key: Union[Label, List[Label]], ndim: int, subset: Optional[FrameOrSeriesUnion] = None, ) -> FrameOrSeriesUnion: """ Sub-classes to define. Return a sliced object. Parameters ---------- key : string / list of selections ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ if subset is None: subset = self elif subset.ndim == 1: # is Series return subset # TODO: _shallow_copy(subset)? return subset[key] _agg_summary_and_see_also_doc = dedent( """ The aggregation operations are always performed over an axis, either the index (default) or the column axis. This behavior is different from `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, `var`), where the default is to compute the aggregation of the flattened array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d, axis=0)``. `agg` is an alias for `aggregate`. Use the alias. See Also -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. core.groupby.GroupBy : Perform operations over groups. core.resample.Resampler : Perform operations over resampled bins. core.window.Rolling : Perform operations over rolling window. core.window.Expanding : Perform operations over expanding window. core.window.ExponentialMovingWindow : Perform operation over exponential weighted window. """ ) _agg_examples_doc = dedent( """ Examples -------- >>> df = pd.DataFrame([[1, 2, 3], ... [4, 5, 6], ... [7, 8, 9], ... [np.nan, np.nan, np.nan]], ... columns=['A', 'B', 'C']) Aggregate these functions over the rows. >>> df.agg(['sum', 'min']) A B C sum 12.0 15.0 18.0 min 1.0 2.0 3.0 Different aggregations per column. >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) A B sum 12.0 NaN min 1.0 2.0 max NaN 8.0 Aggregate different functions over the columns and rename the index of the resulting DataFrame. >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) A B C x 7.0 NaN NaN y NaN 2.0 NaN z NaN NaN 6.0 Aggregate over the columns. >>> df.agg("mean", axis="columns") 0 2.0 1 5.0 2 8.0 3 NaN dtype: float64 """ )
[docs] @doc( _shared_docs["aggregate"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, ) def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) relabeling, func, columns, order = reconstruct_func(func, **kwargs) result = None try: result, how = self._aggregate(func, axis, *args, **kwargs) except TypeError as err: exc = TypeError( "DataFrame constructor called with " f"incompatible data and dtype: {err}" ) raise exc from err if result is None: return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged # For the return values of reconstruct_func, if relabeling is # False, columns and order will be None. assert columns is not None assert order is not None result_in_dict = relabel_result(result, func, columns, order) result = DataFrame(result_in_dict, index=columns) return result
def _aggregate(self, arg, axis=0, *args, **kwargs): if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose # only result result, how = aggregate(self.T, arg, *args, **kwargs) result = result.T if result is not None else result return result, how return aggregate(self, arg, *args, **kwargs) agg = aggregate
[docs] @doc( _shared_docs["transform"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) def transform( self, func: AggFuncType, axis: Axis = 0, *args, **kwargs ) -> DataFrame: result = transform(self, func, axis, *args, **kwargs) assert isinstance(result, DataFrame) return result
[docs] def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ Apply a function along an axis of the DataFrame. Objects passed to the function are Series objects whose index is either the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). By default (``result_type=None``), the final return type is inferred from the return type of the applied function. Otherwise, it depends on the `result_type` argument. Parameters ---------- func : function Function to apply to each column or row. axis : {0 or 'index', 1 or 'columns'}, default 0 Axis along which the function is applied: * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. raw : bool, default False Determines if row or column is passed as a Series or ndarray object: * ``False`` : passes each row or column as a Series to the function. * ``True`` : the passed function will receive ndarray objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. result_type : {'expand', 'reduce', 'broadcast', None}, default None These only act when ``axis=1`` (columns): * 'expand' : list-like results will be turned into columns. * 'reduce' : returns a Series if possible rather than expanding list-like results. This is the opposite of 'expand'. * 'broadcast' : results will be broadcast to the original shape of the DataFrame, the original index and columns will be retained. The default behaviour (None) depends on the return value of the applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns. args : tuple Positional arguments to pass to `func` in addition to the array/series. **kwds Additional keyword arguments to pass as keywords arguments to `func`. Returns ------- Series or DataFrame Result of applying ``func`` along the given axis of the DataFrame. See Also -------- DataFrame.applymap: For elementwise operations. DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. Examples -------- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) >>> df A B 0 4 9 1 4 9 2 4 9 Using a numpy universal function (in this case the same as ``np.sqrt(df)``): >>> df.apply(np.sqrt) A B 0 2.0 3.0 1 2.0 3.0 2 2.0 3.0 Using a reducing function on either axis >>> df.apply(np.sum, axis=0) A 12 B 27 dtype: int64 >>> df.apply(np.sum, axis=1) 0 13 1 13 2 13 dtype: int64 Returning a list-like will result in a Series >>> df.apply(lambda x: [1, 2], axis=1) 0 [1, 2] 1 [1, 2] 2 [1, 2] dtype: object Passing ``result_type='expand'`` will expand list-like results to columns of a Dataframe >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 0 1 0 1 2 1 1 2 2 1 2 Returning a Series inside the function is similar to passing ``result_type='expand'``. The resulting column names will be the Series index. >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) foo bar 0 1 2 1 1 2 2 1 2 Passing ``result_type='broadcast'`` will ensure the same shape result, whether list-like or scalar is returned by the function, and broadcast it along the axis. The resulting column names will be the originals. >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') A B 0 1 2 1 1 2 2 1 2 """ from pandas.core.apply import frame_apply op = frame_apply( self, func=func, axis=axis, raw=raw, result_type=result_type, args=args, kwds=kwds, ) return op.get_result()
[docs] def applymap(self, func, na_action: Optional[str] = None) -> DataFrame: """ Apply a function to a Dataframe elementwise. This method applies a function that accepts and returns a scalar to every element of a DataFrame. Parameters ---------- func : callable Python function, returns a single value from a single value. na_action : {None, 'ignore'}, default None If ‘ignore’, propagate NaN values, without passing them to func. .. versionadded:: 1.2 Returns ------- DataFrame Transformed DataFrame. See Also -------- DataFrame.apply : Apply a function along input axis of DataFrame. Examples -------- >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) >>> df 0 1 0 1.000 2.120 1 3.356 4.567 >>> df.applymap(lambda x: len(str(x))) 0 1 0 3 4 1 5 5 Like Series.map, NA values can be ignored: >>> df_copy = df.copy() >>> df_copy.iloc[0, 0] = pd.NA >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') 0 1 0 <NA> 4 1 5 5 Note that a vectorized version of `func` often exists, which will be much faster. You could square each number elementwise. >>> df.applymap(lambda x: x**2) 0 1 0 1.000000 4.494400 1 11.262736 20.857489 But it's better to avoid applymap in that case. >>> df ** 2 0 1 0 1.000000 4.494400 1 11.262736 20.857489 """ if na_action not in {"ignore", None}: raise ValueError( f"na_action must be 'ignore' or None. Got {repr(na_action)}" ) ignore_na = na_action == "ignore" # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): if x.empty: return lib.map_infer(x, func, ignore_na=ignore_na) return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) return self.apply(infer).__finalize__(self, "applymap")
# ---------------------------------------------------------------------- # Merging / joining methods
[docs] def append( self, other, ignore_index=False, verify_integrity=False, sort=False ) -> DataFrame: """ Append rows of `other` to the end of caller, returning a new object. Columns in `other` that are not in the caller are added as new columns. Parameters ---------- other : DataFrame or Series/dict-like object, or list of these The data to append. ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. verify_integrity : bool, default False If True, raise ValueError on creating index with duplicates. sort : bool, default False Sort columns if the columns of `self` and `other` are not aligned. .. versionchanged:: 1.0.0 Changed to not sort by default. Returns ------- DataFrame See Also -------- concat : General function to concatenate DataFrame or Series objects. Notes ----- If a list of dict/series is passed and the keys are all contained in the DataFrame's index, the order of the columns in the resulting DataFrame will be unchanged. Iteratively appending rows to a DataFrame can be more computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate the list with the original DataFrame all at once. Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) >>> df A B 0 1 2 1 3 4 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) >>> df.append(df2) A B 0 1 2 1 3 4 0 5 6 1 7 8 With `ignore_index` set to True: >>> df.append(df2, ignore_index=True) A B 0 1 2 1 3 4 2 5 6 3 7 8 The following, while not recommended methods for generating DataFrames, show two ways to generate a DataFrame from multiple data sources. Less efficient: >>> df = pd.DataFrame(columns=['A']) >>> for i in range(5): ... df = df.append({'A': i}, ignore_index=True) >>> df A 0 0 1 1 2 2 3 3 4 4 More efficient: >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], ... ignore_index=True) A 0 0 1 1 2 2 3 3 4 4 """ if isinstance(other, (Series, dict)): if isinstance(other, dict): if not ignore_index: raise TypeError("Can only append a dict if ignore_index=True") other = Series(other) if other.name is None and not ignore_index: raise TypeError( "Can only append a Series if ignore_index=True " "or if the Series has a name" ) index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) try: combined_columns = self.columns.append(idx_diff) except TypeError: combined_columns = self.columns.astype(object).append(idx_diff) other = ( other.reindex(combined_columns, copy=False) .to_frame() .T.infer_objects() .rename_axis(index.names, copy=False) ) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) elif isinstance(other, list): if not other: pass elif not isinstance(other[0], DataFrame): other = DataFrame(other) if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat if isinstance(other, (list, tuple)): to_concat = [self, *other] else: to_concat = [self, other] return ( concat( to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort, ) ).__finalize__(self, method="append")
[docs] def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ) -> DataFrame: """ Join columns of another DataFrame. Join columns with `other` DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list. Parameters ---------- other : DataFrame, Series, or list of DataFrame Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame. on : str, list of str, or array-like, optional Column or index level name(s) in the caller to join on the index in `other`, otherwise joins index-on-index. If multiple values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. how : {'left', 'right', 'outer', 'inner'}, default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) * right: use `other`'s index. * outer: form union of calling frame's index (or column if on is specified) with `other`'s index, and sort it. lexicographically. * inner: form intersection of calling frame's index (or column if on is specified) with `other`'s index, preserving the order of the calling's one. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' Suffix to use from right frame's overlapping columns. sort : bool, default False Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type (how keyword). Returns ------- DataFrame A dataframe containing columns from both the caller and `other`. See Also -------- DataFrame.merge : For column(s)-on-column(s) operations. Notes ----- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when passing a list of `DataFrame` objects. Support for specifying index levels as the `on` parameter was added in version 0.23.0. Examples -------- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) >>> df key A 0 K0 A0 1 K1 A1 2 K2 A2 3 K3 A3 4 K4 A4 5 K5 A5 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}) >>> other key B 0 K0 B0 1 K1 B1 2 K2 B2 Join DataFrames using their indexes. >>> df.join(other, lsuffix='_caller', rsuffix='_other') key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 2 K2 A2 K2 B2 3 K3 A3 NaN NaN 4 K4 A4 NaN NaN 5 K5 A5 NaN NaN If we want to join using the key columns, we need to set key to be the index in both `df` and `other`. The joined DataFrame will have key as its index. >>> df.set_index('key').join(other.set_index('key')) A B key K0 A0 B0 K1 A1 B1 K2 A2 B2 K3 A3 NaN K4 A4 NaN K5 A5 NaN Another option to join using the key columns is to use the `on` parameter. DataFrame.join always uses `other`'s index but we can use any column in `df`. This method preserves the original DataFrame's index in the result. >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0 1 K1 A1 B1 2 K2 A2 B2 3 K3 A3 NaN 4 K4 A4 NaN 5 K5 A5 NaN """ return self._join_compat( other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort )
def _join_compat( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ): from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import merge if isinstance(other, Series): if other.name is None: raise ValueError("Other Series must have a name") other = DataFrame({other.name: other}) if isinstance(other, DataFrame): if how == "cross": return merge( self, other, how=how, on=on, suffixes=(lsuffix, rsuffix), sort=sort, ) return merge( self, other, left_on=on, how=how, left_index=on is None, right_index=True, suffixes=(lsuffix, rsuffix), sort=sort, ) else: if on is not None: raise ValueError( "Joining multiple DataFrames only supported for joining on index" ) frames = [self] + list(other) can_concat = all(df.index.is_unique for df in frames) # join indexes only using concat if can_concat: if how == "left": res = concat( frames, axis=1, join="outer", verify_integrity=True, sort=sort ) return res.reindex(self.index, copy=False) else: return concat( frames, axis=1, join=how, verify_integrity=True, sort=sort ) joined = frames[0] for frame in frames[1:]: joined = merge( joined, frame, how=how, left_index=True, right_index=True ) return joined
[docs] @Substitution("") @Appender(_merge_doc, indents=2) def merge( self, right, how="inner", on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=("_x", "_y"), copy=True, indicator=False, validate=None, ) -> DataFrame: from pandas.core.reshape.merge import merge return merge( self, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, indicator=indicator, validate=validate, )
[docs] def round(self, decimals=0, *args, **kwargs) -> DataFrame: """ Round a DataFrame to a variable number of decimal places. Parameters ---------- decimals : int, dict, Series Number of decimal places to round each column to. If an int is given, round each column to the same number of places. Otherwise dict and Series round to variable numbers of places. Column names should be in the keys if `decimals` is a dict-like, or in the index if `decimals` is a Series. Any columns not included in `decimals` will be left as is. Elements of `decimals` which are not columns of the input will be ignored. *args Additional keywords have no effect but might be accepted for compatibility with numpy. **kwargs Additional keywords have no effect but might be accepted for compatibility with numpy. Returns ------- DataFrame A DataFrame with the affected columns rounded to the specified number of decimal places. See Also -------- numpy.around : Round a numpy array to the given number of decimals. Series.round : Round a Series to the given number of decimals. Examples -------- >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df dogs cats 0 0.21 0.32 1 0.01 0.67 2 0.66 0.03 3 0.21 0.18 By providing an integer each column is rounded to the same number of decimal places >>> df.round(1) dogs cats 0 0.2 0.3 1 0.0 0.7 2 0.7 0.0 3 0.2 0.2 With a dict, the number of places for specific columns can be specified with the column names as key and the number of decimal places as value >>> df.round({'dogs': 1, 'cats': 0}) dogs cats 0 0.2 0.0 1 0.0 1.0 2 0.7 0.0 3 0.2 0.0 Using a Series, the number of places for specific columns can be specified with the column names as index and the number of decimal places as value >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) >>> df.round(decimals) dogs cats 0 0.2 0.0 1 0.0 1.0 2 0.7 0.0 3 0.2 0.0 """ from pandas.core.reshape.concat import concat def _dict_round(df, decimals): for col, vals in df.items(): try: yield _series_round(vals, decimals[col]) except KeyError: yield vals def _series_round(s, decimals): if is_integer_dtype(s) or is_float_dtype(s): return s.round(decimals) return s nv.validate_round(args, kwargs) if isinstance(decimals, (dict, Series)): if isinstance(decimals, Series): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.items()] else: raise TypeError("decimals must be an integer, a dict-like or a Series") if len(new_cols) > 0: return self._constructor( concat(new_cols, axis=1), index=self.index, columns=self.columns ) else: return self
# ---------------------------------------------------------------------- # Statistical methods, etc.
[docs] def corr(self, method="pearson", min_periods=1) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- method : {'pearson', 'kendall', 'spearman'} or callable Method of correlation: * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation * callable: callable with input two 1d ndarrays and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable's behavior. .. versionadded:: 0.24.0 min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation. Returns ------- DataFrame Correlation matrix. See Also -------- DataFrame.corrwith : Compute pairwise correlation with another DataFrame or Series. Series.corr : Compute the correlation between two Series. Examples -------- >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... return v >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], ... columns=['dogs', 'cats']) >>> df.corr(method=histogram_intersection) dogs cats dogs 1.0 0.3 cats 0.3 1.0 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = mat.T corrf = nanops.get_corr_func(method) K = len(cols) correl = np.empty((K, K), dtype=float) mask = np.isfinite(mat) for i, ac in enumerate(mat): for j, bc in enumerate(mat): if i > j: continue valid = mask[i] & mask[j] if valid.sum() < min_periods: c = np.nan elif i == j: c = 1.0 elif not valid.all(): c = corrf(ac[valid], bc[valid]) else: c = corrf(ac, bc) correl[i, j] = c correl[j, i] = c else: raise ValueError( "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " f"'{method}' was supplied" ) return self._constructor(correl, index=idx, columns=cols)
[docs] def cov( self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. Compute the pairwise covariance among the series of a DataFrame. The returned data frame is the `covariance matrix <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns of the DataFrame. Both NA and null values are automatically excluded from the calculation. (See the note below about bias from missing values.) A threshold can be set for the minimum number of observations for each value created. Comparisons with observations below this threshold will be returned as ``NaN``. This method is generally used for the analysis of time series data to understand the relationship between different measures across time. Parameters ---------- min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. ddof : int, default 1 Delta degrees of freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. .. versionadded:: 1.1.0 Returns ------- DataFrame The covariance matrix of the series of the DataFrame. See Also -------- Series.cov : Compute covariance with another Series. core.window.ExponentialMovingWindow.cov: Exponential weighted sample covariance. core.window.Expanding.cov : Expanding sample covariance. core.window.Rolling.cov : Rolling sample covariance. Notes ----- Returns the covariance matrix of the DataFrame's time series. The covariance is normalized by N-ddof. For DataFrames that have Series that are missing data (assuming that data is `missing at random <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) the returned covariance matrix will be an unbiased estimate of the variance and covariance between the member Series. However, for many applications this estimate may not be acceptable because the estimate covariance matrix is not guaranteed to be positive semi-definite. This could lead to estimate correlations having absolute values which are greater than one, and/or a non-invertible covariance matrix. See `Estimation of covariance matrices <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_ matrices>`__ for more details. Examples -------- >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], ... columns=['dogs', 'cats']) >>> df.cov() dogs cats dogs 0.666667 -1.000000 cats -1.000000 1.666667 >>> np.random.seed(42) >>> df = pd.DataFrame(np.random.randn(1000, 5), ... columns=['a', 'b', 'c', 'd', 'e']) >>> df.cov() a b c d e a 0.998438 -0.020161 0.059277 -0.008943 0.014144 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 **Minimum number of periods** This method also supports an optional ``min_periods`` keyword that specifies the required minimum number of non-NA observations for each column pair in order to have a valid result: >>> np.random.seed(42) >>> df = pd.DataFrame(np.random.randn(20, 3), ... columns=['a', 'b', 'c']) >>> df.loc[df.index[:5], 'a'] = np.nan >>> df.loc[df.index[5:10], 'b'] = np.nan >>> df.cov(min_periods=12) a b c a 0.316741 NaN -0.150812 b NaN 1.248003 0.191417 c -0.150812 0.191417 0.895202 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False) if notna(mat).all(): if min_periods is not None and min_periods > len(mat): base_cov = np.empty((mat.shape[1], mat.shape[1])) base_cov.fill(np.nan) else: base_cov = np.cov(mat.T, ddof=ddof) base_cov = base_cov.reshape((len(cols), len(cols))) else: base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) return self._constructor(base_cov, index=idx, columns=cols)
[docs] def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: """ Compute pairwise correlation. Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame. DataFrames are first aligned along both axes before computing the correlations. Parameters ---------- other : DataFrame, Series Object with which to compute correlations. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise. drop : bool, default False Drop missing indices from result. method : {'pearson', 'kendall', 'spearman'} or callable Method of correlation: * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation * callable: callable with input two 1d ndarrays and returning a float. .. versionadded:: 0.24.0 Returns ------- Series Pairwise correlations. See Also -------- DataFrame.corr : Compute pairwise correlation of columns. """ axis = self._get_axis_number(axis) this = self._get_numeric_data() if isinstance(other, Series): return this.apply(lambda x: other.corr(x, method=method), axis=axis) other = other._get_numeric_data() left, right = this.align(other, join="inner", copy=False) if axis == 1: left = left.T right = right.T if method == "pearson": # mask missing values left = left + right * 0 right = right + left * 0 # demeaned data ldem = left - left.mean() rdem = right - right.mean() num = (ldem * rdem).sum() dom = (left.count() - 1) * left.std() * right.std() correl = num / dom elif method in ["kendall", "spearman"] or callable(method): def c(x): return nanops.nancorr(x[0], x[1], method=method) correl = self._constructor_sliced( map(c, zip(left.values.T, right.values.T)), index=left.columns ) else: raise ValueError( f"Invalid method {method} was passed, " "valid methods are: 'pearson', 'kendall', " "'spearman', or callable" ) if not drop: # Find non-matching labels along the given axis # and append missing correlations (GH 22375) raxis = 1 if axis == 0 else 0 result_index = this._get_axis(raxis).union(other._get_axis(raxis)) idx_diff = result_index.difference(correl.index) if len(idx_diff) > 0: correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff)) return correl
# ---------------------------------------------------------------------- # ndarray-like stats methods
[docs] def count(self, axis=0, level=None, numeric_only=False): """ Count non-NA cells for each column or row. The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending on `pandas.options.mode.use_inf_as_na`) are considered NA. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index' counts are generated for each column. If 1 or 'columns' counts are generated for each row. level : int or str, optional If the axis is a `MultiIndex` (hierarchical), count along a particular `level`, collapsing into a `DataFrame`. A `str` specifies the level name. numeric_only : bool, default False Include only `float`, `int` or `boolean` data. Returns ------- Series or DataFrame For each column/row the number of non-NA/null entries. If `level` is specified returns a `DataFrame`. See Also -------- Series.count: Number of non-NA elements in a Series. DataFrame.value_counts: Count unique combinations of columns. DataFrame.shape: Number of DataFrame rows and columns (including NA elements). DataFrame.isna: Boolean same-sized DataFrame showing places of NA elements. Examples -------- Constructing DataFrame from a dictionary: >>> df = pd.DataFrame({"Person": ... ["John", "Myla", "Lewis", "John", "Myla"], ... "Age": [24., np.nan, 21., 33, 26], ... "Single": [False, True, True, True, False]}) >>> df Person Age Single 0 John 24.0 False 1 Myla NaN True 2 Lewis 21.0 True 3 John 33.0 True 4 Myla 26.0 False Notice the uncounted NA values: >>> df.count() Person 5 Age 4 Single 5 dtype: int64 Counts for each **row**: >>> df.count(axis='columns') 0 3 1 2 2 3 3 3 4 3 dtype: int64 Counts for one level of a `MultiIndex`: >>> df.set_index(["Person", "Single"]).count(level="Person") Age Person John 2 Lewis 1 Myla 1 """ axis = self._get_axis_number(axis) if level is not None: return self._count_level(level, axis=axis, numeric_only=numeric_only) if numeric_only: frame = self._get_numeric_data() else: frame = self # GH #423 if len(frame._get_axis(axis)) == 0: result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) else: if frame._is_mixed_type or frame._mgr.any_extension_types: # the or any_extension_types is really only hit for single- # column frames with an extension array result = notna(frame).sum(axis=axis) else: # GH13407 series_counts = notna(frame).sum(axis=axis) counts = series_counts.values result = self._constructor_sliced( counts, index=frame._get_agg_axis(axis) ) return result.astype("int64")
def _count_level(self, level, axis=0, numeric_only=False): if numeric_only: frame = self._get_numeric_data() else: frame = self count_axis = frame._get_axis(axis) agg_axis = frame._get_agg_axis(axis) if not isinstance(count_axis, MultiIndex): raise TypeError( f"Can only count levels on hierarchical {self._get_axis_name(axis)}." ) # Mask NaNs: Mask rows or columns where the index level is NaN, and all # values in the DataFrame that are NaN if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might # upcast everything to object values_mask = notna(frame).values else: # But use the speedup when we have homogeneous dtypes values_mask = notna(frame.values) index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: mask = index_mask & values_mask else: mask = index_mask.reshape(-1, 1) & values_mask if isinstance(level, str): level = count_axis._get_level_number(level) level_name = count_axis._names[level] level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) if axis == 1: result = self._constructor(counts, index=agg_axis, columns=level_index) else: result = self._constructor(counts, index=level_index, columns=agg_axis) return result def _reduce( self, op, name: str, *, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds, ): assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None own_dtypes = [arr.dtype for arr in self._iter_column_arrays()] dtype_is_dt = np.array( [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], dtype=bool, ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( "DataFrame.mean and DataFrame.median with numeric_only=None " "will include datetime64 and datetime64tz columns in a " "future version.", FutureWarning, stacklevel=5, ) cols = self.columns[~dtype_is_dt] self = self[cols] # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) labels = self._get_agg_axis(axis) assert axis in [0, 1] def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) def blk_func(values): if isinstance(values, ExtensionArray): return values._reduce(name, skipna=skipna, **kwds) else: return op(values, axis=1, skipna=skipna, **kwds) def _get_data() -> DataFrame: if filter_type is None: data = self._get_numeric_data() else: # GH#25101, GH#24434 assert filter_type == "bool" data = self._get_bool_data() return data if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object # dtypes are unambiguous can be handled with BlockManager.reduce # Case with EAs see GH#35881 df = self if numeric_only is True: df = _get_data() if axis == 1: df = df.T axis = 0 ignore_failures = numeric_only is None # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) out = df._constructor(res).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: # Even if we are object dtype, follow numpy and return # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) return out assert numeric_only is None data = self values = data.values try: result = func(values) except TypeError: # e.g. in nanops trying to convert strs to float data = _get_data() labels = data._get_agg_axis(axis) values = data.values with np.errstate(all="ignore"): result = func(values) if filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) elif filter_type is None and is_object_dtype(result.dtype): try: result = result.astype(np.float64) except (ValueError, TypeError): # try to coerce to the original dtypes item by item if we can pass result = self._constructor_sliced(result, index=labels) return result
[docs] def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. Return Series with number of distinct observations. Can ignore NaN values. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. dropna : bool, default True Don't include NaN in the counts. Returns ------- Series See Also -------- Series.nunique: Method nunique for Series. DataFrame.count: Count non-NA cells for each column or row. Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) >>> df.nunique() A 3 B 1 dtype: int64 >>> df.nunique(axis=1) 0 1 1 2 2 2 dtype: int64 """ return self.apply(Series.nunique, axis=axis, dropna=dropna)
[docs] def idxmin(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of minimum over requested axis. NA/null values are excluded. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. Returns ------- Series Indexes of minima along the specified axis. Raises ------ ValueError * If the row/column is empty See Also -------- Series.idxmin : Return index of the minimum element. Notes ----- This method is the DataFrame version of ``ndarray.argmin``. Examples -------- Consider a dataset containing food consumption in Argentina. >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions Pork 10.51 37.20 Wheat Products 103.11 19.66 Beef 55.48 1712.00 By default, it returns the index for the minimum value in each column. >>> df.idxmin() consumption Pork co2_emissions Wheat Products dtype: object To return the index for the minimum value in each row, use ``axis="columns"``. >>> df.idxmin(axis="columns") Pork consumption Wheat Products co2_emissions Beef consumption dtype: object """ axis = self._get_axis_number(axis) res = self._reduce( nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False ) indices = res._values # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" assert isinstance(indices, np.ndarray) # for mypy index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] return self._constructor_sliced(result, index=self._get_agg_axis(axis))
[docs] def idxmax(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of maximum over requested axis. NA/null values are excluded. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. Returns ------- Series Indexes of maxima along the specified axis. Raises ------ ValueError * If the row/column is empty See Also -------- Series.idxmax : Return index of the maximum element. Notes ----- This method is the DataFrame version of ``ndarray.argmax``. Examples -------- Consider a dataset containing food consumption in Argentina. >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions Pork 10.51 37.20 Wheat Products 103.11 19.66 Beef 55.48 1712.00 By default, it returns the index for the maximum value in each column. >>> df.idxmax() consumption Wheat Products co2_emissions Beef dtype: object To return the index for the maximum value in each row, use ``axis="columns"``. >>> df.idxmax(axis="columns") Pork co2_emissions Wheat Products consumption Beef co2_emissions dtype: object """ axis = self._get_axis_number(axis) res = self._reduce( nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False ) indices = res._values # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" assert isinstance(indices, np.ndarray) # for mypy index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] return self._constructor_sliced(result, index=self._get_agg_axis(axis))
def _get_agg_axis(self, axis_num: int) -> Index: """ Let's be explicit about this. """ if axis_num == 0: return self.columns elif axis_num == 1: return self.index else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
[docs] def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame: """ Get the mode(s) of each element along the selected axis. The mode of a set of values is the value that appears most often. It can be multiple values. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to iterate over while searching for the mode: * 0 or 'index' : get mode of each column * 1 or 'columns' : get mode of each row. numeric_only : bool, default False If True, only apply to numeric columns. dropna : bool, default True Don't consider counts of NaN/NaT. .. versionadded:: 0.24.0 Returns ------- DataFrame The modes of each column or row. See Also -------- Series.mode : Return the highest frequency value in a Series. Series.value_counts : Return the counts of values in a Series. Examples -------- >>> df = pd.DataFrame([('bird', 2, 2), ... ('mammal', 4, np.nan), ... ('arthropod', 8, 0), ... ('bird', 2, np.nan)], ... index=('falcon', 'horse', 'spider', 'ostrich'), ... columns=('species', 'legs', 'wings')) >>> df species legs wings falcon bird 2 2.0 horse mammal 4 NaN spider arthropod 8 0.0 ostrich bird 2 NaN By default, missing values are not considered, and the mode of wings are both 0 and 2. Because the resulting DataFrame has two rows, the second row of ``species`` and ``legs`` contains ``NaN``. >>> df.mode() species legs wings 0 bird 2.0 0.0 1 NaN NaN 2.0 Setting ``dropna=False`` ``NaN`` values are considered and they can be the mode (like for wings). >>> df.mode(dropna=False) species legs wings 0 bird 2 NaN Setting ``numeric_only=True``, only the mode of numeric columns is computed, and columns of other types are ignored. >>> df.mode(numeric_only=True) legs wings 0 2.0 0.0 1 NaN 2.0 To compute the mode over columns and not rows, use the axis parameter: >>> df.mode(axis='columns', numeric_only=True) 0 1 falcon 2.0 NaN horse 4.0 NaN spider 0.0 8.0 ostrich 2.0 NaN """ data = self if not numeric_only else self._get_numeric_data() def f(s): return s.mode(dropna=dropna) return data.apply(f, axis=axis)
[docs] def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): """ Return values at the given quantile over requested axis. Parameters ---------- q : float or array-like, default 0.5 (50% quantile) Value between 0 <= q <= 1, the quantile(s) to compute. axis : {0, 1, 'index', 'columns'}, default 0 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. numeric_only : bool, default True If False, the quantile of datetime and timedelta data will be computed as well. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: * linear: `i + (j - i) * fraction`, where `fraction` is the fractional part of the index surrounded by `i` and `j`. * lower: `i`. * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. Returns ------- Series or DataFrame If ``q`` is an array, a DataFrame will be returned where the index is ``q``, the columns are the columns of self, and the values are the quantiles. If ``q`` is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. See Also -------- core.window.Rolling.quantile: Rolling quantile. numpy.percentile: Numpy function to compute the percentile. Examples -------- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) a 1.3 b 3.7 Name: 0.1, dtype: float64 >>> df.quantile([.1, .5]) a b 0.1 1.3 3.7 0.5 2.5 55.0 Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. >>> df = pd.DataFrame({'A': [1, 2], ... 'B': [pd.Timestamp('2010'), ... pd.Timestamp('2011')], ... 'C': [pd.Timedelta('1 days'), ... pd.Timedelta('2 days')]}) >>> df.quantile(0.5, numeric_only=False) A 1.5 B 2010-07-02 12:00:00 C 1 days 12:00:00 Name: 0.5, dtype: object """ validate_percentile(q) data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) is_transposed = axis == 1 if is_transposed: data = data.T if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns cols = Index([], name=self.columns.name) if is_list_like(q): return self._constructor([], index=q, columns=cols) return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) result = data._mgr.quantile( qs=q, axis=1, interpolation=interpolation, transposed=is_transposed ) if result.ndim == 2: result = self._constructor(result) else: result = self._constructor_sliced(result, name=q) if is_transposed: result = result.T return result
[docs] def to_timestamp( self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True ) -> DataFrame: """ Cast to DatetimeIndex of timestamps, at *beginning* of period. Parameters ---------- freq : str, default frequency of PeriodIndex Desired frequency. how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period vs. end. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). copy : bool, default True If False then underlying input data is not copied. Returns ------- DataFrame with DatetimeIndex """ new_obj = self.copy(deep=copy) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) if not isinstance(old_ax, PeriodIndex): raise TypeError(f"unsupported Type {type(old_ax).__name__}") new_ax = old_ax.to_timestamp(freq=freq, how=how) setattr(new_obj, axis_name, new_ax) return new_obj
[docs] def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. Convert DataFrame from DatetimeIndex to PeriodIndex with desired frequency (inferred from index if not passed). Parameters ---------- freq : str, default Frequency of the PeriodIndex. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). copy : bool, default True If False then underlying input data is not copied. Returns ------- DataFrame with PeriodIndex """ new_obj = self.copy(deep=copy) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) if not isinstance(old_ax, DatetimeIndex): raise TypeError(f"unsupported Type {type(old_ax).__name__}") new_ax = old_ax.to_period(freq=freq) setattr(new_obj, axis_name, new_ax) return new_obj
[docs] def isin(self, values) -> DataFrame: """ Whether each element in the DataFrame is contained in values. Parameters ---------- values : iterable, Series, DataFrame or dict The result will only be true at a location if all the labels match. If `values` is a Series, that's the index. If `values` is a dict, the keys must be the column names, which must match. If `values` is a DataFrame, then both the index and column labels must match. Returns ------- DataFrame DataFrame of booleans showing whether each element in the DataFrame is contained in values. See Also -------- DataFrame.eq: Equality test for DataFrame. Series.isin: Equivalent method on Series. Series.str.contains: Test if pattern or regex is contained within a string of a Series or Index. Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df num_legs num_wings falcon 2 2 dog 4 0 When ``values`` is a list check whether every value in the DataFrame is present in the list (which animals have 0 or 2 legs or wings) >>> df.isin([0, 2]) num_legs num_wings falcon True True dog False True When ``values`` is a dict, we can pass values to check for each column separately: >>> df.isin({'num_wings': [0, 3]}) num_legs num_wings falcon False False dog False True When ``values`` is a Series or DataFrame the index and column must match. Note that 'falcon' does not match based on the number of legs in df2. >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, ... index=['spider', 'falcon']) >>> df.isin(other) num_legs num_wings falcon True True dog False False """ if isinstance(values, dict): from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) return concat( ( self.iloc[:, [i]].isin(values[col]) for i, col in enumerate(self.columns) ), axis=1, ) elif isinstance(values, Series): if not values.index.is_unique: raise ValueError("cannot compute isin with a duplicate axis.") return self.eq(values.reindex_like(self), axis="index") elif isinstance(values, DataFrame): if not (values.columns.is_unique and values.index.is_unique): raise ValueError("cannot compute isin with a duplicate axis.") return self.eq(values.reindex_like(self)) else: if not is_list_like(values): raise TypeError( "only list-like or dict-like objects are allowed " "to be passed to DataFrame.isin(), " f"you passed a '{type(values).__name__}'" ) return self._constructor( algorithms.isin(self.values.ravel(), values).reshape(self.shape), self.index, self.columns, )
# ---------------------------------------------------------------------- # Add index and columns _AXIS_ORDERS = ["index", "columns"] _AXIS_TO_AXIS_NUMBER: Dict[Axis, int] = { **NDFrame._AXIS_TO_AXIS_NUMBER, 1: 1, "columns": 1, } _AXIS_REVERSED = True _AXIS_LEN = len(_AXIS_ORDERS) _info_axis_number = 1 _info_axis_name = "columns" index: Index = properties.AxisProperty( axis=1, doc="The index (row labels) of the DataFrame." ) columns: Index = properties.AxisProperty( axis=0, doc="The column labels of the DataFrame." ) @property def _AXIS_NUMBERS(self) -> Dict[str, int]: """.. deprecated:: 1.1.0""" super()._AXIS_NUMBERS return {"index": 0, "columns": 1} @property def _AXIS_NAMES(self) -> Dict[int, str]: """.. deprecated:: 1.1.0""" super()._AXIS_NAMES return {0: "index", 1: "columns"} # ---------------------------------------------------------------------- # Add plotting methods to DataFrame plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame sparse = CachedAccessor("sparse", SparseFrameAccessor)
DataFrame._add_numeric_operations() ops.add_flex_arithmetic_methods(DataFrame) def _from_nested_dict(data) -> collections.defaultdict: new_data: collections.defaultdict = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v return new_data