--- /srv/reproducible-results/rbuild-debian/r-b-build.6brRH3xl/b1/pandas_2.2.2+dfsg-4_arm64.changes
+++ /srv/reproducible-results/rbuild-debian/r-b-build.6brRH3xl/b2/pandas_2.2.2+dfsg-4_arm64.changes
├── Files
│ @@ -1,5 +1,5 @@
│
│ - 2c7a4bf14a4a26e13710ad68a9482842 10491700 doc optional python-pandas-doc_2.2.2+dfsg-4_all.deb
│ - 63b97a9f7242be30dfe1cef1a6e57cea 35712496 debug optional python3-pandas-lib-dbgsym_2.2.2+dfsg-4_arm64.deb
│ - cb7f29e1b171ccb1c4b2a3315bbb57f6 3727912 python optional python3-pandas-lib_2.2.2+dfsg-4_arm64.deb
│ + 099d941583f57fcbb1cc1a602fc305c6 10492516 doc optional python-pandas-doc_2.2.2+dfsg-4_all.deb
│ + 870243fde85c0731fcff25659cea3590 35712356 debug optional python3-pandas-lib-dbgsym_2.2.2+dfsg-4_arm64.deb
│ + 4da540ede26207b6fce95c4ab7e503e6 3727556 python optional python3-pandas-lib_2.2.2+dfsg-4_arm64.deb
│ cd2f98205ce377f7af776b3e5c9564dd 3095732 python optional python3-pandas_2.2.2+dfsg-4_all.deb
├── python-pandas-doc_2.2.2+dfsg-4_all.deb
│ ├── file list
│ │ @@ -1,3 +1,3 @@
│ │ -rw-r--r-- 0 0 0 4 2024-07-07 18:36:37.000000 debian-binary
│ │ --rw-r--r-- 0 0 0 147520 2024-07-07 18:36:37.000000 control.tar.xz
│ │ --rw-r--r-- 0 0 0 10343988 2024-07-07 18:36:37.000000 data.tar.xz
│ │ +-rw-r--r-- 0 0 0 147580 2024-07-07 18:36:37.000000 control.tar.xz
│ │ +-rw-r--r-- 0 0 0 10344744 2024-07-07 18:36:37.000000 data.tar.xz
│ ├── control.tar.xz
│ │ ├── control.tar
│ │ │ ├── ./control
│ │ │ │ @@ -1,13 +1,13 @@
│ │ │ │ Package: python-pandas-doc
│ │ │ │ Source: pandas
│ │ │ │ Version: 2.2.2+dfsg-4
│ │ │ │ Architecture: all
│ │ │ │ Maintainer: Debian Science Team This has improved the performance compared to the pure Python approach by one-third. We can annotate the function variables and return types as well as use Annotating the functions with C types yields an over ten times performance improvement compared to
│ │ │ │ the original Python implementation. When re-profiling, time is spent creating a This implementation creates an array of zeros and inserts the result
│ │ │ │ of Since Performance has improved from the prior implementation by almost ten times. The majority of the time is now spent in However, a loop indexer In [9]: %timeit df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ -170 ms +- 23.5 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +72.6 ms +- 56.6 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │
Declaring C types¶
│ │ │ │ cdef
│ │ │ │ @@ -595,34 +595,34 @@
│ │ │ │ ....: for i in range(N):
│ │ │ │ ....: s += f_typed(a + i * dx)
│ │ │ │ ....: return s * dx
│ │ │ │ ....:
│ │ │ │
│ │ │ │
│ │ │ │ In [11]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ -18 ms +- 2.14 ms per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +8.91 ms +- 10.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
Using ndarray¶
│ │ │ │ Series
from each row, and calling __getitem__
from both
│ │ │ │ the index and the series (three times for each row). These Python function calls are expensive and
│ │ │ │ can be improved by passing an np.ndarray
.In [12]: %prun -l 4 df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ - 52528 function calls (52510 primitive calls) in 0.024 seconds
│ │ │ │ + 52528 function calls (52510 primitive calls) in 0.023 seconds
│ │ │ │
│ │ │ │ Ordered by: internal time
│ │ │ │ List reduced from 157 to 4 due to restriction <4>
│ │ │ │
│ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function)
│ │ │ │ - 3000 0.004 0.000 0.015 0.000 series.py:1095(__getitem__)
│ │ │ │ - 3000 0.003 0.000 0.007 0.000 series.py:1220(_get_value)
│ │ │ │ + 3000 0.004 0.000 0.014 0.000 series.py:1095(__getitem__)
│ │ │ │ + 3000 0.003 0.000 0.006 0.000 series.py:1220(_get_value)
│ │ │ │ 16098 0.002 0.000 0.003 0.000 {built-in method builtins.isinstance}
│ │ │ │ 3000 0.002 0.000 0.002 0.000 base.py:3777(get_loc)
│ │ │ │
In [13]: %%cython
│ │ │ │ ....: cimport numpy as np
│ │ │ │ ....: import numpy as np
│ │ │ │ @@ -659,15 +659,15 @@
│ │ │ │
integrate_f_typed
applied over each row. Looping over an ndarray
is faster
│ │ │ │ in Cython than looping over a Series
object.apply_integrate_f
is typed to accept an np.ndarray
, Series.to_numpy()
│ │ │ │ calls are needed to utilize this function.In [14]: %timeit apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(), df["N"].to_numpy())
│ │ │ │ -1.77 ms +- 287 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ +1.17 ms +- 452 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │
Disabling compiler directives¶
│ │ │ │ apply_integrate_f
. Disabling Cython’s boundscheck
│ │ │ │ @@ -676,16 +676,16 @@
│ │ │ │ 78 function calls in 0.001 seconds
│ │ │ │
│ │ │ │ Ordered by: internal time
│ │ │ │ List reduced from 21 to 4 due to restriction <4>
│ │ │ │
│ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function)
│ │ │ │ 1 0.001 0.001 0.001 0.001 <string>:1(<module>)
│ │ │ │ - 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec}
│ │ │ │ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
│ │ │ │ + 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec}
│ │ │ │ 3 0.000 0.000 0.000 0.000 frame.py:4062(__getitem__)
│ │ │ │ In [16]: %%cython
│ │ │ │ ....: cimport cython
│ │ │ │ ....: cimport numpy as np
│ │ │ │ ....: import numpy as np
│ │ │ │ @@ -719,15 +719,15 @@
│ │ │ │ from /build/reproducible-path/pandas-2.2.2+dfsg/buildtmp/.cache/ipython/cython/_cython_magic_6e544448a5b49fdd9edaaa8b35e916ee98fab35c.c:1251:
│ │ │ │ /usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
│ │ │ │ 17 | #warning "Using deprecated NumPy API, disable it with " \
│ │ │ │ | ^~~~~~~
│ │ │ │
In [17]: %timeit apply_integrate_f_wrap(df["a"].to_numpy(), df["b"].to_numpy(), df["N"].to_numpy())
│ │ │ │ -1.4 ms +- 208 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ +815 us +- 254 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │
i
accessing an invalid location in an array would cause a segfault because memory access isn’t checked.
│ │ │ │ For more about boundscheck
and wraparound
, see the Cython docs on
│ │ │ │ compiler directives.DataFrame
. This engine requires the
│ │ │ │ optional dependency numexpr
to be installed.
The 'python'
engine is generally not useful except for testing
│ │ │ │ other evaluation engines against it. You will achieve no performance
│ │ │ │ benefits using eval()
with engine='python'
and may
│ │ │ │ incur a performance hit.
In [40]: %timeit df1 + df2 + df3 + df4
│ │ │ │ -35 ms +- 7.49 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +8.79 ms +- 201 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
In [41]: %timeit pd.eval("df1 + df2 + df3 + df4", engine="python")
│ │ │ │ -35.6 ms +- 4.31 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +11.4 ms +- 75.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
DataFrame.eval()
method¶In addition to the top level pandas.eval()
function you can also
│ │ │ │ evaluate an expression in the “context” of a DataFrame
.
In [58]: nrows, ncols = 20000, 100
│ │ │ │
│ │ │ │ In [59]: df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]
│ │ │ │
DataFrame
arithmetic:
In [60]: %timeit df1 + df2 + df3 + df4
│ │ │ │ -38.5 ms +- 4.88 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +8.96 ms +- 77.9 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
In [61]: %timeit pd.eval("df1 + df2 + df3 + df4")
│ │ │ │ -13.2 ms +- 578 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +6.77 ms +- 36.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
DataFrame
comparison:
In [62]: %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)
│ │ │ │ -41.5 ms +- 2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +11 ms +- 165 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
In [63]: %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)")
│ │ │ │ -17.1 ms +- 749 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +11 ms +- 255 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
DataFrame
arithmetic with unaligned axes.
In [64]: s = pd.Series(np.random.randn(50))
│ │ │ │
│ │ │ │ In [65]: %timeit df1 + df2 + df3 + df4 + s
│ │ │ │ -59.8 ms +- 7.17 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +20.3 ms +- 129 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │
In [66]: %timeit pd.eval("df1 + df2 + df3 + df4 + s")
│ │ │ │ -13.2 ms +- 433 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +8.86 ms +- 81 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │
Note
│ │ │ │Operations such as
│ │ │ │1 and 2 # would parse to 1 & 2, but should evaluate to 2
│ │ │ │ 3 or 4 # would parse to 3 | 4, but should evaluate to 3
│ │ │ │ ├── html2text {}
│ │ │ │ │ @@ -113,32 +113,32 @@
│ │ │ │ │ ...: dx = (b - a) / N
│ │ │ │ │ ...: for i in range(N):
│ │ │ │ │ ...: s += f(a + i * dx)
│ │ │ │ │ ...: return s * dx
│ │ │ │ │ ...:
│ │ │ │ │ We achieve our result by using _D_a_t_a_F_r_a_m_e_._a_p_p_l_y_(_) (row-wise):
│ │ │ │ │ In [5]: %timeit df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ │ -141 ms +- 35 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +79.1 ms +- 4.2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ Let’s take a look and see where the time is spent during this operation using
│ │ │ │ │ the _p_r_u_n_ _i_p_y_t_h_o_n_ _m_a_g_i_c_ _f_u_n_c_t_i_o_n:
│ │ │ │ │ # most time consuming 4 calls
│ │ │ │ │ In [6]: %prun -l 4 df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]),
│ │ │ │ │ axis=1) # noqa E999
│ │ │ │ │ - 605951 function calls (605933 primitive calls) in 0.350 seconds
│ │ │ │ │ + 605951 function calls (605933 primitive calls) in 0.189 seconds
│ │ │ │ │
│ │ │ │ │ Ordered by: internal time
│ │ │ │ │ List reduced from 159 to 4 due to restriction <4>
│ │ │ │ │
│ │ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function)
│ │ │ │ │ - 1000 0.201 0.000 0.299 0.000 :1
│ │ │ │ │ + 1000 0.112 0.000 0.166 0.000 :1
│ │ │ │ │ (integrate_f)
│ │ │ │ │ - 552423 0.098 0.000 0.098 0.000 :1
│ │ │ │ │ + 552423 0.054 0.000 0.054 0.000 :1
│ │ │ │ │ (f)
│ │ │ │ │ - 3000 0.008 0.000 0.032 0.000 series.py:1095(__getitem__)
│ │ │ │ │ - 3000 0.007 0.000 0.016 0.000 series.py:1220(_get_value)
│ │ │ │ │ + 3000 0.004 0.000 0.015 0.000 series.py:1095(__getitem__)
│ │ │ │ │ + 3000 0.003 0.000 0.007 0.000 series.py:1220(_get_value)
│ │ │ │ │ By far the majority of time is spend inside either integrate_f or f, hence
│ │ │ │ │ we’ll concentrate our efforts cythonizing these two functions.
│ │ │ │ │ ******** PPllaaiinn CCyytthhoonn_?¶ ********
│ │ │ │ │ First we’re going to need to import the Cython magic function to IPython:
│ │ │ │ │ In [7]: %load_ext Cython
│ │ │ │ │ Now, let’s simply copy our functions over to Cython:
│ │ │ │ │ In [8]: %%cython
│ │ │ │ │ @@ -149,15 +149,15 @@
│ │ │ │ │ ...: dx = (b - a) / N
│ │ │ │ │ ...: for i in range(N):
│ │ │ │ │ ...: s += f_plain(a + i * dx)
│ │ │ │ │ ...: return s * dx
│ │ │ │ │ ...:
│ │ │ │ │ In [9]: %timeit df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]),
│ │ │ │ │ axis=1)
│ │ │ │ │ -170 ms +- 23.5 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +72.6 ms +- 56.6 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ This has improved the performance compared to the pure Python approach by one-
│ │ │ │ │ third.
│ │ │ │ │ ******** DDeeccllaarriinngg CC ttyyppeess_?¶ ********
│ │ │ │ │ We can annotate the function variables and return types as well as use cdef and
│ │ │ │ │ cpdef to improve performance:
│ │ │ │ │ In [10]: %%cython
│ │ │ │ │ ....: cdef double f_typed(double x) except? -2:
│ │ │ │ │ @@ -169,32 +169,32 @@
│ │ │ │ │ ....: dx = (b - a) / N
│ │ │ │ │ ....: for i in range(N):
│ │ │ │ │ ....: s += f_typed(a + i * dx)
│ │ │ │ │ ....: return s * dx
│ │ │ │ │ ....:
│ │ │ │ │ In [11]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]),
│ │ │ │ │ axis=1)
│ │ │ │ │ -18 ms +- 2.14 ms per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +8.91 ms +- 10.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ Annotating the functions with C types yields an over ten times performance
│ │ │ │ │ improvement compared to the original Python implementation.
│ │ │ │ │ ******** UUssiinngg nnddaarrrraayy_?¶ ********
│ │ │ │ │ When re-profiling, time is spent creating a _S_e_r_i_e_s from each row, and calling
│ │ │ │ │ __getitem__ from both the index and the series (three times for each row).
│ │ │ │ │ These Python function calls are expensive and can be improved by passing an
│ │ │ │ │ np.ndarray.
│ │ │ │ │ In [12]: %prun -l 4 df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x
│ │ │ │ │ ["N"]), axis=1)
│ │ │ │ │ - 52528 function calls (52510 primitive calls) in 0.024 seconds
│ │ │ │ │ + 52528 function calls (52510 primitive calls) in 0.023 seconds
│ │ │ │ │
│ │ │ │ │ Ordered by: internal time
│ │ │ │ │ List reduced from 157 to 4 due to restriction <4>
│ │ │ │ │
│ │ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function)
│ │ │ │ │ - 3000 0.004 0.000 0.015 0.000 series.py:1095(__getitem__)
│ │ │ │ │ - 3000 0.003 0.000 0.007 0.000 series.py:1220(_get_value)
│ │ │ │ │ + 3000 0.004 0.000 0.014 0.000 series.py:1095(__getitem__)
│ │ │ │ │ + 3000 0.003 0.000 0.006 0.000 series.py:1220(_get_value)
│ │ │ │ │ 16098 0.002 0.000 0.003 0.000 {built-in method
│ │ │ │ │ builtins.isinstance}
│ │ │ │ │ 3000 0.002 0.000 0.002 0.000 base.py:3777(get_loc)
│ │ │ │ │ In [13]: %%cython
│ │ │ │ │ ....: cimport numpy as np
│ │ │ │ │ ....: import numpy as np
│ │ │ │ │ ....: cdef double f_typed(double x) except? -2:
│ │ │ │ │ @@ -238,31 +238,31 @@
│ │ │ │ │ This implementation creates an array of zeros and inserts the result of
│ │ │ │ │ integrate_f_typed applied over each row. Looping over an ndarray is faster in
│ │ │ │ │ Cython than looping over a _S_e_r_i_e_s object.
│ │ │ │ │ Since apply_integrate_f is typed to accept an np.ndarray, _S_e_r_i_e_s_._t_o___n_u_m_p_y_(_)
│ │ │ │ │ calls are needed to utilize this function.
│ │ │ │ │ In [14]: %timeit apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(), df
│ │ │ │ │ ["N"].to_numpy())
│ │ │ │ │ -1.77 ms +- 287 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ +1.17 ms +- 452 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ Performance has improved from the prior implementation by almost ten times.
│ │ │ │ │ ******** DDiissaabblliinngg ccoommppiilleerr ddiirreeccttiivveess_?¶ ********
│ │ │ │ │ The majority of the time is now spent in apply_integrate_f. Disabling Cython’s
│ │ │ │ │ boundscheck and wraparound checks can yield more performance.
│ │ │ │ │ In [15]: %prun -l 4 apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(),
│ │ │ │ │ df["N"].to_numpy())
│ │ │ │ │ 78 function calls in 0.001 seconds
│ │ │ │ │
│ │ │ │ │ Ordered by: internal time
│ │ │ │ │ List reduced from 21 to 4 due to restriction <4>
│ │ │ │ │
│ │ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function)
│ │ │ │ │ 1 0.001 0.001 0.001 0.001 :1()
│ │ │ │ │ - 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec}
│ │ │ │ │ 1 0.000 0.000 0.000 0.000 {method 'disable' of
│ │ │ │ │ '_lsprof.Profiler' objects}
│ │ │ │ │ + 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec}
│ │ │ │ │ 3 0.000 0.000 0.000 0.000 frame.py:4062(__getitem__)
│ │ │ │ │ In [16]: %%cython
│ │ │ │ │ ....: cimport cython
│ │ │ │ │ ....: cimport numpy as np
│ │ │ │ │ ....: import numpy as np
│ │ │ │ │ ....: cdef np.float64_t f_typed(np.float64_t x) except? -2:
│ │ │ │ │ ....: return x * (x - 1)
│ │ │ │ │ @@ -301,15 +301,15 @@
│ │ │ │ │ /usr/lib/python3/dist-packages/numpy/core/include/numpy/
│ │ │ │ │ npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API,
│ │ │ │ │ disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
│ │ │ │ │ 17 | #warning "Using deprecated NumPy API, disable it with " \
│ │ │ │ │ | ^~~~~~~
│ │ │ │ │ In [17]: %timeit apply_integrate_f_wrap(df["a"].to_numpy(), df["b"].to_numpy(),
│ │ │ │ │ df["N"].to_numpy())
│ │ │ │ │ -1.4 ms +- 208 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ +815 us +- 254 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ However, a loop indexer i accessing an invalid location in an array would cause
│ │ │ │ │ a segfault because memory access isn’t checked. For more about boundscheck and
│ │ │ │ │ wraparound, see the Cython docs on _c_o_m_p_i_l_e_r_ _d_i_r_e_c_t_i_v_e_s.
│ │ │ │ │ ********** NNuummbbaa ((JJIITT ccoommppiillaattiioonn))_?¶ **********
│ │ │ │ │ An alternative to statically compiling Cython code is to use a dynamic just-in-
│ │ │ │ │ time (JIT) compiler with _N_u_m_b_a.
│ │ │ │ │ Numba allows you to write a pure Python function which can be JIT compiled to
│ │ │ │ │ @@ -612,17 +612,17 @@
│ │ │ │ │ The 'numexpr' engine is the more performant engine that can yield performance
│ │ │ │ │ improvements compared to standard Python syntax for large _D_a_t_a_F_r_a_m_e. This
│ │ │ │ │ engine requires the optional dependency numexpr to be installed.
│ │ │ │ │ The 'python' engine is generally nnoott useful except for testing other evaluation
│ │ │ │ │ engines against it. You will achieve nnoo performance benefits using _e_v_a_l_(_) with
│ │ │ │ │ engine='python' and may incur a performance hit.
│ │ │ │ │ In [40]: %timeit df1 + df2 + df3 + df4
│ │ │ │ │ -35 ms +- 7.49 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +8.79 ms +- 201 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ In [41]: %timeit pd.eval("df1 + df2 + df3 + df4", engine="python")
│ │ │ │ │ -35.6 ms +- 4.31 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +11.4 ms +- 75.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ ******** TThhee _DD_aa_tt_aa_FF_rr_aa_mm_ee_.._ee_vv_aa_ll_((_)) mmeetthhoodd_?¶ ********
│ │ │ │ │ In addition to the top level _p_a_n_d_a_s_._e_v_a_l_(_) function you can also evaluate an
│ │ │ │ │ expression in the “context” of a _D_a_t_a_F_r_a_m_e.
│ │ │ │ │ In [42]: df = pd.DataFrame(np.random.randn(5, 2), columns=["a", "b"])
│ │ │ │ │
│ │ │ │ │ In [43]: df.eval("a + b")
│ │ │ │ │ Out[43]:
│ │ │ │ │ @@ -719,29 +719,29 @@
│ │ │ │ │ _p_a_n_d_a_s_._e_v_a_l_(_) works well with expressions containing large arrays.
│ │ │ │ │ In [58]: nrows, ncols = 20000, 100
│ │ │ │ │
│ │ │ │ │ In [59]: df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for
│ │ │ │ │ _ in range(4)]
│ │ │ │ │ _D_a_t_a_F_r_a_m_e arithmetic:
│ │ │ │ │ In [60]: %timeit df1 + df2 + df3 + df4
│ │ │ │ │ -38.5 ms +- 4.88 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +8.96 ms +- 77.9 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ In [61]: %timeit pd.eval("df1 + df2 + df3 + df4")
│ │ │ │ │ -13.2 ms +- 578 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +6.77 ms +- 36.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ _D_a_t_a_F_r_a_m_e comparison:
│ │ │ │ │ In [62]: %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)
│ │ │ │ │ -41.5 ms +- 2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +11 ms +- 165 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ In [63]: %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)")
│ │ │ │ │ -17.1 ms +- 749 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +11 ms +- 255 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ _D_a_t_a_F_r_a_m_e arithmetic with unaligned axes.
│ │ │ │ │ In [64]: s = pd.Series(np.random.randn(50))
│ │ │ │ │
│ │ │ │ │ In [65]: %timeit df1 + df2 + df3 + df4 + s
│ │ │ │ │ -59.8 ms +- 7.17 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +20.3 ms +- 129 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ In [66]: %timeit pd.eval("df1 + df2 + df3 + df4 + s")
│ │ │ │ │ -13.2 ms +- 433 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +8.86 ms +- 81 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ Note
│ │ │ │ │ Operations such as
│ │ │ │ │ 1 and 2 # would parse to 1 & 2, but should evaluate to 2
│ │ │ │ │ 3 or 4 # would parse to 3 | 4, but should evaluate to 3
│ │ │ │ │ ~1 # this is okay, but slower when using eval
│ │ │ │ │ should be performed in Python. An exception will be raised if you try to
│ │ │ │ │ perform any boolean/bitwise operations with scalar operands that are not of
│ │ │ ├── ./usr/share/doc/python-pandas-doc/html/user_guide/scale.html
│ │ │ │ @@ -916,16 +916,16 @@
│ │ │ │ ....: files = pathlib.Path("data/timeseries/").glob("ts*.parquet")
│ │ │ │ ....: counts = pd.Series(dtype=int)
│ │ │ │ ....: for path in files:
│ │ │ │ ....: df = pd.read_parquet(path)
│ │ │ │ ....: counts = counts.add(df["name"].value_counts(), fill_value=0)
│ │ │ │ ....: counts.astype(int)
│ │ │ │ ....:
│ │ │ │ -CPU times: user 617 us, sys: 39 us, total: 656 us
│ │ │ │ -Wall time: 679 us
│ │ │ │ +CPU times: user 715 us, sys: 0 ns, total: 715 us
│ │ │ │ +Wall time: 732 us
│ │ │ │ Out[32]: Series([], dtype: int64)
│ │ │ │
Some readers, like pandas.read_csv()
, offer parameters to control the
│ │ │ │ chunksize
when reading a single file.
Manually chunking is an OK option for workflows that don’t
│ │ │ │ require too sophisticated of operations. Some operations, like pandas.DataFrame.groupby()
, are
│ │ │ │ ├── html2text {}
│ │ │ │ │ @@ -633,16 +633,16 @@
│ │ │ │ │ ....: files = pathlib.Path("data/timeseries/").glob("ts*.parquet")
│ │ │ │ │ ....: counts = pd.Series(dtype=int)
│ │ │ │ │ ....: for path in files:
│ │ │ │ │ ....: df = pd.read_parquet(path)
│ │ │ │ │ ....: counts = counts.add(df["name"].value_counts(), fill_value=0)
│ │ │ │ │ ....: counts.astype(int)
│ │ │ │ │ ....:
│ │ │ │ │ -CPU times: user 617 us, sys: 39 us, total: 656 us
│ │ │ │ │ -Wall time: 679 us
│ │ │ │ │ +CPU times: user 715 us, sys: 0 ns, total: 715 us
│ │ │ │ │ +Wall time: 732 us
│ │ │ │ │ Out[32]: Series([], dtype: int64)
│ │ │ │ │ Some readers, like _p_a_n_d_a_s_._r_e_a_d___c_s_v_(_), offer parameters to control the chunksize
│ │ │ │ │ when reading a single file.
│ │ │ │ │ Manually chunking is an OK option for workflows that don’t require too
│ │ │ │ │ sophisticated of operations. Some operations, like _p_a_n_d_a_s_._D_a_t_a_F_r_a_m_e_._g_r_o_u_p_b_y_(_),
│ │ │ │ │ are much harder to do chunkwise. In these cases, you may be better switching to
│ │ │ │ │ a different library that implements these out-of-core algorithms for you.
│ │ │ ├── ./usr/share/doc/python-pandas-doc/html/user_guide/style.ipynb.gz
│ │ │ │ ├── style.ipynb
│ │ │ │ │ ├── Pretty-printed
│ │ │ │ │ │┄ Similarity: 0.9985610875706213%
│ │ │ │ │ │┄ Differences: {"'cells'": "{1: {'metadata': {'execution': {'iopub.execute_input': '2024-09-09T16:40:59.567542Z', "
│ │ │ │ │ │┄ "'iopub.status.busy': '2024-09-09T16:40:59.566415Z', 'iopub.status.idle': "
│ │ │ │ │ │┄ "'2024-09-09T16:41:00.084217Z', 'shell.execute_reply': "
│ │ │ │ │ │┄ "'2024-09-09T16:41:00.083249Z'}}}, 3: {'metadata': {'execution': "
│ │ │ │ │ │┄ "{'iopub.execute_input': '2024-09-09T16:41:00.089505Z', 'iopub.status.busy': "
│ │ │ │ │ │┄ "'2024-09-09T16:41:00.088820Z', 'iopub.status.idle': '2024-09-09T16:41:0 […]
│ │ │ │ │ │ @@ -39,18 +39,18 @@
│ │ │ │ │ │ ]
│ │ │ │ │ │ },
│ │ │ │ │ │ {
│ │ │ │ │ │ "cell_type": "code",
│ │ │ │ │ │ "execution_count": 1,
│ │ │ │ │ │ "metadata": {
│ │ │ │ │ │ "execution": {
│ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:39.786466Z",
│ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:39.786192Z",
│ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.326133Z",
│ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.325474Z"
│ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:40:59.567542Z",
│ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:40:59.566415Z",
│ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.084217Z",
│ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.083249Z"
│ │ │ │ │ │ },
│ │ │ │ │ │ "nbsphinx": "hidden"
│ │ │ │ │ │ },
│ │ │ │ │ │ "outputs": [],
│ │ │ │ │ │ "source": [
│ │ │ │ │ │ "import matplotlib.pyplot\n",
│ │ │ │ │ │ "# We have this here to trigger matplotlib's font cache stuff.\n",
│ │ │ │ │ │ @@ -77,36 +77,36 @@
│ │ │ │ │ │ ]
│ │ │ │ │ │ },
│ │ │ │ │ │ {
│ │ │ │ │ │ "cell_type": "code",
│ │ │ │ │ │ "execution_count": 2,
│ │ │ │ │ │ "metadata": {
│ │ │ │ │ │ "execution": {
│ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.330290Z",
│ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.329614Z",
│ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.652895Z",
│ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.652131Z"
│ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.089505Z",
│ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.088820Z",
│ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.348911Z",
│ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.347940Z"
│ │ │ │ │ │ }
│ │ │ │ │ │ },
│ │ │ │ │ │ "outputs": [],
│ │ │ │ │ │ "source": [
│ │ │ │ │ │ "import pandas as pd\n",
│ │ │ │ │ │ "import numpy as np\n",
│ │ │ │ │ │ "import matplotlib as mpl\n"
│ │ │ │ │ │ ]
│ │ │ │ │ │ },
│ │ │ │ │ │ {
│ │ │ │ │ │ "cell_type": "code",
│ │ │ │ │ │ "execution_count": 3,
│ │ │ │ │ │ "metadata": {
│ │ │ │ │ │ "execution": {
│ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.656793Z",
│ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.656423Z",
│ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.717112Z",
│ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.716387Z"
│ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.353625Z",
│ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.353229Z",
│ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.413466Z",
│ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.412533Z"
│ │ │ │ │ │ },
│ │ │ │ │ │ "nbsphinx": "hidden"
│ │ │ │ │ │ },
│ │ │ │ │ │ "outputs": [],
│ │ │ │ │ │ "source": [
│ │ │ │ │ │ "# For reproducibility - this doesn't respect uuid_len or positionally-passed uuid but the places here that use that coincidentally bypass this anyway\n",
│ │ │ │ │ │ "from pandas.io.formats.style import Styler\n",
│ │ │ │ │ │ @@ -123,18 +123,18 @@
│ │ │ │ │ │ ]
│ │ │ │ │ │ },
│ │ │ │ │ │ {
│ │ │ │ │ │ "cell_type": "code",
│ │ │ │ │ │ "execution_count": 4,
│ │ │ │ │ │ "metadata": {
│ │ │ │ │ │ "execution": {
│ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.724580Z",
│ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.724197Z",
│ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.739478Z",
│ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.738831Z"
│ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.417808Z",
│ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.417414Z",
│ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.429050Z",
│ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.428150Z"
│ │ │ │ │ │ }
│ │ │ │ │ │ },
│ │ │ │ │ │ "outputs": [
│ │ │ │ │ │ {
│ │ │ │ │ │ "data": {
│ │ │ │ │ │ "text/html": [
│ │ │ │ │ │ "