--- /srv/reproducible-results/rbuild-debian/r-b-build.6brRH3xl/b1/pandas_2.2.2+dfsg-4_arm64.changes +++ /srv/reproducible-results/rbuild-debian/r-b-build.6brRH3xl/b2/pandas_2.2.2+dfsg-4_arm64.changes ├── Files │ @@ -1,5 +1,5 @@ │ │ - 2c7a4bf14a4a26e13710ad68a9482842 10491700 doc optional python-pandas-doc_2.2.2+dfsg-4_all.deb │ - 63b97a9f7242be30dfe1cef1a6e57cea 35712496 debug optional python3-pandas-lib-dbgsym_2.2.2+dfsg-4_arm64.deb │ - cb7f29e1b171ccb1c4b2a3315bbb57f6 3727912 python optional python3-pandas-lib_2.2.2+dfsg-4_arm64.deb │ + 099d941583f57fcbb1cc1a602fc305c6 10492516 doc optional python-pandas-doc_2.2.2+dfsg-4_all.deb │ + 870243fde85c0731fcff25659cea3590 35712356 debug optional python3-pandas-lib-dbgsym_2.2.2+dfsg-4_arm64.deb │ + 4da540ede26207b6fce95c4ab7e503e6 3727556 python optional python3-pandas-lib_2.2.2+dfsg-4_arm64.deb │ cd2f98205ce377f7af776b3e5c9564dd 3095732 python optional python3-pandas_2.2.2+dfsg-4_all.deb ├── python-pandas-doc_2.2.2+dfsg-4_all.deb │ ├── file list │ │ @@ -1,3 +1,3 @@ │ │ -rw-r--r-- 0 0 0 4 2024-07-07 18:36:37.000000 debian-binary │ │ --rw-r--r-- 0 0 0 147520 2024-07-07 18:36:37.000000 control.tar.xz │ │ --rw-r--r-- 0 0 0 10343988 2024-07-07 18:36:37.000000 data.tar.xz │ │ +-rw-r--r-- 0 0 0 147580 2024-07-07 18:36:37.000000 control.tar.xz │ │ +-rw-r--r-- 0 0 0 10344744 2024-07-07 18:36:37.000000 data.tar.xz │ ├── control.tar.xz │ │ ├── control.tar │ │ │ ├── ./control │ │ │ │ @@ -1,13 +1,13 @@ │ │ │ │ Package: python-pandas-doc │ │ │ │ Source: pandas │ │ │ │ Version: 2.2.2+dfsg-4 │ │ │ │ Architecture: all │ │ │ │ Maintainer: Debian Science Team │ │ │ │ -Installed-Size: 197198 │ │ │ │ +Installed-Size: 197196 │ │ │ │ Depends: libjs-sphinxdoc (>= 7.4), libjs-mathjax │ │ │ │ Suggests: python3-pandas │ │ │ │ Section: doc │ │ │ │ Priority: optional │ │ │ │ Multi-Arch: foreign │ │ │ │ Homepage: https://pandas.pydata.org/ │ │ │ │ Description: data structures for "relational" or "labeled" data - documentation │ │ │ ├── ./md5sums │ │ │ │ ├── ./md5sums │ │ │ │ │┄ Files differ │ ├── data.tar.xz │ │ ├── data.tar │ │ │ ├── file list │ │ │ │ @@ -5498,17 +5498,17 @@ │ │ │ │ -rw-r--r-- 0 root (0) root (0) 22454 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.set_uuid.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 18990 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.template_html.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 19084 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.template_html_style.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 19088 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.template_html_table.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 19047 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.template_latex.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 19000 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.template_string.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 34222 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.text_gradient.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 33392 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_excel.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 29803 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_html.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 75794 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_latex.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 33073 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_excel.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 29484 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_html.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 75475 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_latex.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 24564 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_string.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 23142 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.use.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 24197 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.json.build_table_schema.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 254 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.json.json_normalize.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 21877 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.stata.StataReader.data_label.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 22877 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.stata.StataReader.value_labels.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 22908 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.stata.StataReader.variable_labels.html │ │ │ │ @@ -6255,61 +6255,61 @@ │ │ │ │ -rw-r--r-- 0 root (0) root (0) 206416 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/series.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 38771 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/style.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 38909 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/testing.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 43513 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reference/window.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 244 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/release.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 269 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/reshaping.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 7370 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/search.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 2357410 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/searchindex.js │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 2357415 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/searchindex.js │ │ │ │ -rw-r--r-- 0 root (0) root (0) 259 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/sparse.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 244 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/style.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 255 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/text.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 256 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/timedeltas.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 277 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/timeseries.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 272 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/tutorials.html │ │ │ │ drwxr-xr-x 0 root (0) root (0) 0 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/ │ │ │ │ -rw-r--r-- 0 root (0) root (0) 161368 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/10min.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 274172 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/advanced.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 274174 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/advanced.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 425346 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/basics.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 26009 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/boolean.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 207638 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/categorical.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 7710 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/cookbook.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 55618 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/copy_on_write.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 150182 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/dsintro.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 70826 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/duplicates.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 105332 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/enhancingperf.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 105339 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/enhancingperf.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 97529 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/gotchas.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 276317 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/groupby.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 49266 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/index.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 382548 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/indexing.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 31200 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/integer_na.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 1131712 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/io.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 198947 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/merging.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 168461 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/missing_data.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 101657 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/options.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 136945 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/pyarrow.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 152672 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/reshaping.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 105027 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/scale.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 105026 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/scale.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 55141 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/sparse.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 688567 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/style.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 87891 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/style.ipynb.gz │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 87837 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/style.ipynb.gz │ │ │ │ -rw-r--r-- 0 root (0) root (0) 133802 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/text.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 90572 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/timedeltas.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 477039 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/timeseries.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 192397 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/visualization.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 131481 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/user_guide/window.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 270 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/visualization.html │ │ │ │ drwxr-xr-x 0 root (0) root (0) 0 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/ │ │ │ │ -rw-r--r-- 0 root (0) root (0) 97789 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/index.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 8695 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/index.html.gz │ │ │ │ -rw-r--r-- 0 root (0) root (0) 74235 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.10.0.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 56699 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.10.1.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 72591 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.11.0.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 94554 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.12.0.html │ │ │ │ --rw-r--r-- 0 root (0) root (0) 212797 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.13.0.html │ │ │ │ +-rw-r--r-- 0 root (0) root (0) 212799 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.13.0.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 79662 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.13.1.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 234094 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.14.0.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 73500 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.14.1.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 242800 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.15.0.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 58487 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.15.1.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 65335 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.15.2.html │ │ │ │ -rw-r--r-- 0 root (0) root (0) 135619 2024-07-07 18:36:37.000000 ./usr/share/doc/python-pandas-doc/html/whatsnew/v0.16.0.html │ │ │ ├── ./usr/share/doc/python-pandas-doc/html/reference/api/pandas.io.formats.style.Styler.to_excel.html │ │ │ │ @@ -152,352 +152,287 @@ │ │ │ │
│ │ │ │
In [9]: %timeit df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ -170 ms +- 23.5 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +72.6 ms +- 56.6 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

This has improved the performance compared to the pure Python approach by one-third.

│ │ │ │ │ │ │ │
│ │ │ │

Declaring C types

│ │ │ │

We can annotate the function variables and return types as well as use cdef │ │ │ │ @@ -595,34 +595,34 @@ │ │ │ │ ....: for i in range(N): │ │ │ │ ....: s += f_typed(a + i * dx) │ │ │ │ ....: return s * dx │ │ │ │ ....: │ │ │ │ │ │ │ │ │ │ │ │

In [11]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ -18 ms +- 2.14 ms per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +8.91 ms +- 10.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

Annotating the functions with C types yields an over ten times performance improvement compared to │ │ │ │ the original Python implementation.

│ │ │ │
│ │ │ │
│ │ │ │

Using ndarray

│ │ │ │

When re-profiling, time is spent creating a Series from each row, and calling __getitem__ from both │ │ │ │ the index and the series (three times for each row). These Python function calls are expensive and │ │ │ │ can be improved by passing an np.ndarray.

│ │ │ │
In [12]: %prun -l 4 df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ -         52528 function calls (52510 primitive calls) in 0.024 seconds
│ │ │ │ +         52528 function calls (52510 primitive calls) in 0.023 seconds
│ │ │ │  
│ │ │ │     Ordered by: internal time
│ │ │ │     List reduced from 157 to 4 due to restriction <4>
│ │ │ │  
│ │ │ │     ncalls  tottime  percall  cumtime  percall filename:lineno(function)
│ │ │ │ -     3000    0.004    0.000    0.015    0.000 series.py:1095(__getitem__)
│ │ │ │ -     3000    0.003    0.000    0.007    0.000 series.py:1220(_get_value)
│ │ │ │ +     3000    0.004    0.000    0.014    0.000 series.py:1095(__getitem__)
│ │ │ │ +     3000    0.003    0.000    0.006    0.000 series.py:1220(_get_value)
│ │ │ │      16098    0.002    0.000    0.003    0.000 {built-in method builtins.isinstance}
│ │ │ │       3000    0.002    0.000    0.002    0.000 base.py:3777(get_loc)
│ │ │ │  
│ │ │ │
│ │ │ │
In [13]: %%cython
│ │ │ │     ....: cimport numpy as np
│ │ │ │     ....: import numpy as np
│ │ │ │ @@ -659,15 +659,15 @@
│ │ │ │  
│ │ │ │

This implementation creates an array of zeros and inserts the result │ │ │ │ of integrate_f_typed applied over each row. Looping over an ndarray is faster │ │ │ │ in Cython than looping over a Series object.

│ │ │ │

Since apply_integrate_f is typed to accept an np.ndarray, Series.to_numpy() │ │ │ │ calls are needed to utilize this function.

│ │ │ │
In [14]: %timeit apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(), df["N"].to_numpy())
│ │ │ │ -1.77 ms +- 287 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ +1.17 ms +- 452 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

Performance has improved from the prior implementation by almost ten times.

│ │ │ │
│ │ │ │
│ │ │ │

Disabling compiler directives

│ │ │ │

The majority of the time is now spent in apply_integrate_f. Disabling Cython’s boundscheck │ │ │ │ @@ -676,16 +676,16 @@ │ │ │ │ 78 function calls in 0.001 seconds │ │ │ │ │ │ │ │ Ordered by: internal time │ │ │ │ List reduced from 21 to 4 due to restriction <4> │ │ │ │ │ │ │ │ ncalls tottime percall cumtime percall filename:lineno(function) │ │ │ │ 1 0.001 0.001 0.001 0.001 <string>:1(<module>) │ │ │ │ - 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec} │ │ │ │ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} │ │ │ │ + 1 0.000 0.000 0.001 0.001 {built-in method builtins.exec} │ │ │ │ 3 0.000 0.000 0.000 0.000 frame.py:4062(__getitem__) │ │ │ │ │ │ │ │ │ │ │ │

In [16]: %%cython
│ │ │ │     ....: cimport cython
│ │ │ │     ....: cimport numpy as np
│ │ │ │     ....: import numpy as np
│ │ │ │ @@ -719,15 +719,15 @@
│ │ │ │                   from /build/reproducible-path/pandas-2.2.2+dfsg/buildtmp/.cache/ipython/cython/_cython_magic_6e544448a5b49fdd9edaaa8b35e916ee98fab35c.c:1251:
│ │ │ │  /usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
│ │ │ │     17 | #warning "Using deprecated NumPy API, disable it with " \
│ │ │ │        |  ^~~~~~~
│ │ │ │  
│ │ │ │
│ │ │ │
In [17]: %timeit apply_integrate_f_wrap(df["a"].to_numpy(), df["b"].to_numpy(), df["N"].to_numpy())
│ │ │ │ -1.4 ms +- 208 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ +815 us +- 254 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

However, a loop indexer i accessing an invalid location in an array would cause a segfault because memory access isn’t checked. │ │ │ │ For more about boundscheck and wraparound, see the Cython docs on │ │ │ │ compiler directives.

│ │ │ │
│ │ │ │ │ │ │ │ @@ -1085,19 +1085,19 @@ │ │ │ │ compared to standard Python syntax for large DataFrame. This engine requires the │ │ │ │ optional dependency numexpr to be installed.

│ │ │ │

The 'python' engine is generally not useful except for testing │ │ │ │ other evaluation engines against it. You will achieve no performance │ │ │ │ benefits using eval() with engine='python' and may │ │ │ │ incur a performance hit.

│ │ │ │
In [40]: %timeit df1 + df2 + df3 + df4
│ │ │ │ -35 ms +- 7.49 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +8.79 ms +- 201 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │
In [41]: %timeit pd.eval("df1 + df2 + df3 + df4", engine="python")
│ │ │ │ -35.6 ms +- 4.31 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +11.4 ms +- 75.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │ │ │ │ │
│ │ │ │

The DataFrame.eval() method

│ │ │ │

In addition to the top level pandas.eval() function you can also │ │ │ │ evaluate an expression in the “context” of a DataFrame.

│ │ │ │ @@ -1212,39 +1212,39 @@ │ │ │ │
In [58]: nrows, ncols = 20000, 100
│ │ │ │  
│ │ │ │  In [59]: df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]
│ │ │ │  
│ │ │ │
│ │ │ │

DataFrame arithmetic:

│ │ │ │
In [60]: %timeit df1 + df2 + df3 + df4
│ │ │ │ -38.5 ms +- 4.88 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +8.96 ms +- 77.9 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │
In [61]: %timeit pd.eval("df1 + df2 + df3 + df4")
│ │ │ │ -13.2 ms +- 578 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +6.77 ms +- 36.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

DataFrame comparison:

│ │ │ │
In [62]: %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)
│ │ │ │ -41.5 ms +- 2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +11 ms +- 165 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │
In [63]: %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)")
│ │ │ │ -17.1 ms +- 749 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +11 ms +- 255 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │

DataFrame arithmetic with unaligned axes.

│ │ │ │
In [64]: s = pd.Series(np.random.randn(50))
│ │ │ │  
│ │ │ │  In [65]: %timeit df1 + df2 + df3 + df4 + s
│ │ │ │ -59.8 ms +- 7.17 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ +20.3 ms +- 129 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │
In [66]: %timeit pd.eval("df1 + df2 + df3 + df4 + s")
│ │ │ │ -13.2 ms +- 433 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ +8.86 ms +- 81 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │  
│ │ │ │
│ │ │ │
│ │ │ │

Note

│ │ │ │

Operations such as

│ │ │ │
1 and 2  # would parse to 1 & 2, but should evaluate to 2
│ │ │ │  3 or 4  # would parse to 3 | 4, but should evaluate to 3
│ │ │ │ ├── html2text {}
│ │ │ │ │ @@ -113,32 +113,32 @@
│ │ │ │ │     ...:     dx = (b - a) / N
│ │ │ │ │     ...:     for i in range(N):
│ │ │ │ │     ...:         s += f(a + i * dx)
│ │ │ │ │     ...:     return s * dx
│ │ │ │ │     ...:
│ │ │ │ │  We achieve our result by using _D_a_t_a_F_r_a_m_e_._a_p_p_l_y_(_) (row-wise):
│ │ │ │ │  In [5]: %timeit df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1)
│ │ │ │ │ -141 ms +- 35 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +79.1 ms +- 4.2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │  Let’s take a look and see where the time is spent during this operation using
│ │ │ │ │  the _p_r_u_n_ _i_p_y_t_h_o_n_ _m_a_g_i_c_ _f_u_n_c_t_i_o_n:
│ │ │ │ │  # most time consuming 4 calls
│ │ │ │ │  In [6]: %prun -l 4 df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]),
│ │ │ │ │  axis=1)  # noqa E999
│ │ │ │ │ -         605951 function calls (605933 primitive calls) in 0.350 seconds
│ │ │ │ │ +         605951 function calls (605933 primitive calls) in 0.189 seconds
│ │ │ │ │  
│ │ │ │ │     Ordered by: internal time
│ │ │ │ │     List reduced from 159 to 4 due to restriction <4>
│ │ │ │ │  
│ │ │ │ │     ncalls  tottime  percall  cumtime  percall filename:lineno(function)
│ │ │ │ │ -     1000    0.201    0.000    0.299    0.000 :1
│ │ │ │ │ +     1000    0.112    0.000    0.166    0.000 :1
│ │ │ │ │  (integrate_f)
│ │ │ │ │ -   552423    0.098    0.000    0.098    0.000 :1
│ │ │ │ │ +   552423    0.054    0.000    0.054    0.000 :1
│ │ │ │ │  (f)
│ │ │ │ │ -     3000    0.008    0.000    0.032    0.000 series.py:1095(__getitem__)
│ │ │ │ │ -     3000    0.007    0.000    0.016    0.000 series.py:1220(_get_value)
│ │ │ │ │ +     3000    0.004    0.000    0.015    0.000 series.py:1095(__getitem__)
│ │ │ │ │ +     3000    0.003    0.000    0.007    0.000 series.py:1220(_get_value)
│ │ │ │ │  By far the majority of time is spend inside either integrate_f or f, hence
│ │ │ │ │  we’ll concentrate our efforts cythonizing these two functions.
│ │ │ │ │  ******** PPllaaiinn CCyytthhoonn_?¶ ********
│ │ │ │ │  First we’re going to need to import the Cython magic function to IPython:
│ │ │ │ │  In [7]: %load_ext Cython
│ │ │ │ │  Now, let’s simply copy our functions over to Cython:
│ │ │ │ │  In [8]: %%cython
│ │ │ │ │ @@ -149,15 +149,15 @@
│ │ │ │ │     ...:     dx = (b - a) / N
│ │ │ │ │     ...:     for i in range(N):
│ │ │ │ │     ...:         s += f_plain(a + i * dx)
│ │ │ │ │     ...:     return s * dx
│ │ │ │ │     ...:
│ │ │ │ │  In [9]: %timeit df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]),
│ │ │ │ │  axis=1)
│ │ │ │ │ -170 ms +- 23.5 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +72.6 ms +- 56.6 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │  This has improved the performance compared to the pure Python approach by one-
│ │ │ │ │  third.
│ │ │ │ │  ******** DDeeccllaarriinngg CC ttyyppeess_?¶ ********
│ │ │ │ │  We can annotate the function variables and return types as well as use cdef and
│ │ │ │ │  cpdef to improve performance:
│ │ │ │ │  In [10]: %%cython
│ │ │ │ │     ....: cdef double f_typed(double x) except? -2:
│ │ │ │ │ @@ -169,32 +169,32 @@
│ │ │ │ │     ....:     dx = (b - a) / N
│ │ │ │ │     ....:     for i in range(N):
│ │ │ │ │     ....:         s += f_typed(a + i * dx)
│ │ │ │ │     ....:     return s * dx
│ │ │ │ │     ....:
│ │ │ │ │  In [11]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]),
│ │ │ │ │  axis=1)
│ │ │ │ │ -18 ms +- 2.14 ms per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +8.91 ms +- 10.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  Annotating the functions with C types yields an over ten times performance
│ │ │ │ │  improvement compared to the original Python implementation.
│ │ │ │ │  ******** UUssiinngg nnddaarrrraayy_?¶ ********
│ │ │ │ │  When re-profiling, time is spent creating a _S_e_r_i_e_s from each row, and calling
│ │ │ │ │  __getitem__ from both the index and the series (three times for each row).
│ │ │ │ │  These Python function calls are expensive and can be improved by passing an
│ │ │ │ │  np.ndarray.
│ │ │ │ │  In [12]: %prun -l 4 df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x
│ │ │ │ │  ["N"]), axis=1)
│ │ │ │ │ -         52528 function calls (52510 primitive calls) in 0.024 seconds
│ │ │ │ │ +         52528 function calls (52510 primitive calls) in 0.023 seconds
│ │ │ │ │  
│ │ │ │ │     Ordered by: internal time
│ │ │ │ │     List reduced from 157 to 4 due to restriction <4>
│ │ │ │ │  
│ │ │ │ │     ncalls  tottime  percall  cumtime  percall filename:lineno(function)
│ │ │ │ │ -     3000    0.004    0.000    0.015    0.000 series.py:1095(__getitem__)
│ │ │ │ │ -     3000    0.003    0.000    0.007    0.000 series.py:1220(_get_value)
│ │ │ │ │ +     3000    0.004    0.000    0.014    0.000 series.py:1095(__getitem__)
│ │ │ │ │ +     3000    0.003    0.000    0.006    0.000 series.py:1220(_get_value)
│ │ │ │ │      16098    0.002    0.000    0.003    0.000 {built-in method
│ │ │ │ │  builtins.isinstance}
│ │ │ │ │       3000    0.002    0.000    0.002    0.000 base.py:3777(get_loc)
│ │ │ │ │  In [13]: %%cython
│ │ │ │ │     ....: cimport numpy as np
│ │ │ │ │     ....: import numpy as np
│ │ │ │ │     ....: cdef double f_typed(double x) except? -2:
│ │ │ │ │ @@ -238,31 +238,31 @@
│ │ │ │ │  This implementation creates an array of zeros and inserts the result of
│ │ │ │ │  integrate_f_typed applied over each row. Looping over an ndarray is faster in
│ │ │ │ │  Cython than looping over a _S_e_r_i_e_s object.
│ │ │ │ │  Since apply_integrate_f is typed to accept an np.ndarray, _S_e_r_i_e_s_._t_o___n_u_m_p_y_(_)
│ │ │ │ │  calls are needed to utilize this function.
│ │ │ │ │  In [14]: %timeit apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(), df
│ │ │ │ │  ["N"].to_numpy())
│ │ │ │ │ -1.77 ms +- 287 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ +1.17 ms +- 452 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │  Performance has improved from the prior implementation by almost ten times.
│ │ │ │ │  ******** DDiissaabblliinngg ccoommppiilleerr ddiirreeccttiivveess_?¶ ********
│ │ │ │ │  The majority of the time is now spent in apply_integrate_f. Disabling Cython’s
│ │ │ │ │  boundscheck and wraparound checks can yield more performance.
│ │ │ │ │  In [15]: %prun -l 4 apply_integrate_f(df["a"].to_numpy(), df["b"].to_numpy(),
│ │ │ │ │  df["N"].to_numpy())
│ │ │ │ │           78 function calls in 0.001 seconds
│ │ │ │ │  
│ │ │ │ │     Ordered by: internal time
│ │ │ │ │     List reduced from 21 to 4 due to restriction <4>
│ │ │ │ │  
│ │ │ │ │     ncalls  tottime  percall  cumtime  percall filename:lineno(function)
│ │ │ │ │          1    0.001    0.001    0.001    0.001 :1()
│ │ │ │ │ -        1    0.000    0.000    0.001    0.001 {built-in method builtins.exec}
│ │ │ │ │          1    0.000    0.000    0.000    0.000 {method 'disable' of
│ │ │ │ │  '_lsprof.Profiler' objects}
│ │ │ │ │ +        1    0.000    0.000    0.001    0.001 {built-in method builtins.exec}
│ │ │ │ │          3    0.000    0.000    0.000    0.000 frame.py:4062(__getitem__)
│ │ │ │ │  In [16]: %%cython
│ │ │ │ │     ....: cimport cython
│ │ │ │ │     ....: cimport numpy as np
│ │ │ │ │     ....: import numpy as np
│ │ │ │ │     ....: cdef np.float64_t f_typed(np.float64_t x) except? -2:
│ │ │ │ │     ....:     return x * (x - 1)
│ │ │ │ │ @@ -301,15 +301,15 @@
│ │ │ │ │  /usr/lib/python3/dist-packages/numpy/core/include/numpy/
│ │ │ │ │  npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API,
│ │ │ │ │  disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
│ │ │ │ │     17 | #warning "Using deprecated NumPy API, disable it with " \
│ │ │ │ │        |  ^~~~~~~
│ │ │ │ │  In [17]: %timeit apply_integrate_f_wrap(df["a"].to_numpy(), df["b"].to_numpy(),
│ │ │ │ │  df["N"].to_numpy())
│ │ │ │ │ -1.4 ms +- 208 us per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │ +815 us +- 254 ns per loop (mean +- std. dev. of 7 runs, 1,000 loops each)
│ │ │ │ │  However, a loop indexer i accessing an invalid location in an array would cause
│ │ │ │ │  a segfault because memory access isn’t checked. For more about boundscheck and
│ │ │ │ │  wraparound, see the Cython docs on _c_o_m_p_i_l_e_r_ _d_i_r_e_c_t_i_v_e_s.
│ │ │ │ │  ********** NNuummbbaa ((JJIITT ccoommppiillaattiioonn))_?¶ **********
│ │ │ │ │  An alternative to statically compiling Cython code is to use a dynamic just-in-
│ │ │ │ │  time (JIT) compiler with _N_u_m_b_a.
│ │ │ │ │  Numba allows you to write a pure Python function which can be JIT compiled to
│ │ │ │ │ @@ -612,17 +612,17 @@
│ │ │ │ │  The 'numexpr' engine is the more performant engine that can yield performance
│ │ │ │ │  improvements compared to standard Python syntax for large _D_a_t_a_F_r_a_m_e. This
│ │ │ │ │  engine requires the optional dependency numexpr to be installed.
│ │ │ │ │  The 'python' engine is generally nnoott useful except for testing other evaluation
│ │ │ │ │  engines against it. You will achieve nnoo performance benefits using _e_v_a_l_(_) with
│ │ │ │ │  engine='python' and may incur a performance hit.
│ │ │ │ │  In [40]: %timeit df1 + df2 + df3 + df4
│ │ │ │ │ -35 ms +- 7.49 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +8.79 ms +- 201 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  In [41]: %timeit pd.eval("df1 + df2 + df3 + df4", engine="python")
│ │ │ │ │ -35.6 ms +- 4.31 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +11.4 ms +- 75.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  ******** TThhee _DD_aa_tt_aa_FF_rr_aa_mm_ee_.._ee_vv_aa_ll_((_)) mmeetthhoodd_?¶ ********
│ │ │ │ │  In addition to the top level _p_a_n_d_a_s_._e_v_a_l_(_) function you can also evaluate an
│ │ │ │ │  expression in the “context” of a _D_a_t_a_F_r_a_m_e.
│ │ │ │ │  In [42]: df = pd.DataFrame(np.random.randn(5, 2), columns=["a", "b"])
│ │ │ │ │  
│ │ │ │ │  In [43]: df.eval("a + b")
│ │ │ │ │  Out[43]:
│ │ │ │ │ @@ -719,29 +719,29 @@
│ │ │ │ │  _p_a_n_d_a_s_._e_v_a_l_(_) works well with expressions containing large arrays.
│ │ │ │ │  In [58]: nrows, ncols = 20000, 100
│ │ │ │ │  
│ │ │ │ │  In [59]: df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for
│ │ │ │ │  _ in range(4)]
│ │ │ │ │  _D_a_t_a_F_r_a_m_e arithmetic:
│ │ │ │ │  In [60]: %timeit df1 + df2 + df3 + df4
│ │ │ │ │ -38.5 ms +- 4.88 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +8.96 ms +- 77.9 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  In [61]: %timeit pd.eval("df1 + df2 + df3 + df4")
│ │ │ │ │ -13.2 ms +- 578 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +6.77 ms +- 36.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  _D_a_t_a_F_r_a_m_e comparison:
│ │ │ │ │  In [62]: %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)
│ │ │ │ │ -41.5 ms +- 2 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +11 ms +- 165 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  In [63]: %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)")
│ │ │ │ │ -17.1 ms +- 749 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +11 ms +- 255 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  _D_a_t_a_F_r_a_m_e arithmetic with unaligned axes.
│ │ │ │ │  In [64]: s = pd.Series(np.random.randn(50))
│ │ │ │ │  
│ │ │ │ │  In [65]: %timeit df1 + df2 + df3 + df4 + s
│ │ │ │ │ -59.8 ms +- 7.17 ms per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │ +20.3 ms +- 129 us per loop (mean +- std. dev. of 7 runs, 10 loops each)
│ │ │ │ │  In [66]: %timeit pd.eval("df1 + df2 + df3 + df4 + s")
│ │ │ │ │ -13.2 ms +- 433 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │ +8.86 ms +- 81 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
│ │ │ │ │  Note
│ │ │ │ │  Operations such as
│ │ │ │ │  1 and 2  # would parse to 1 & 2, but should evaluate to 2
│ │ │ │ │  3 or 4  # would parse to 3 | 4, but should evaluate to 3
│ │ │ │ │  ~1  # this is okay, but slower when using eval
│ │ │ │ │  should be performed in Python. An exception will be raised if you try to
│ │ │ │ │  perform any boolean/bitwise operations with scalar operands that are not of
│ │ │ ├── ./usr/share/doc/python-pandas-doc/html/user_guide/scale.html
│ │ │ │ @@ -916,16 +916,16 @@
│ │ │ │     ....: files = pathlib.Path("data/timeseries/").glob("ts*.parquet")
│ │ │ │     ....: counts = pd.Series(dtype=int)
│ │ │ │     ....: for path in files:
│ │ │ │     ....:     df = pd.read_parquet(path)
│ │ │ │     ....:     counts = counts.add(df["name"].value_counts(), fill_value=0)
│ │ │ │     ....: counts.astype(int)
│ │ │ │     ....: 
│ │ │ │ -CPU times: user 617 us, sys: 39 us, total: 656 us
│ │ │ │ -Wall time: 679 us
│ │ │ │ +CPU times: user 715 us, sys: 0 ns, total: 715 us
│ │ │ │ +Wall time: 732 us
│ │ │ │  Out[32]: Series([], dtype: int64)
│ │ │ │  
│ │ │ │
│ │ │ │

Some readers, like pandas.read_csv(), offer parameters to control the │ │ │ │ chunksize when reading a single file.

│ │ │ │

Manually chunking is an OK option for workflows that don’t │ │ │ │ require too sophisticated of operations. Some operations, like pandas.DataFrame.groupby(), are │ │ │ │ ├── html2text {} │ │ │ │ │ @@ -633,16 +633,16 @@ │ │ │ │ │ ....: files = pathlib.Path("data/timeseries/").glob("ts*.parquet") │ │ │ │ │ ....: counts = pd.Series(dtype=int) │ │ │ │ │ ....: for path in files: │ │ │ │ │ ....: df = pd.read_parquet(path) │ │ │ │ │ ....: counts = counts.add(df["name"].value_counts(), fill_value=0) │ │ │ │ │ ....: counts.astype(int) │ │ │ │ │ ....: │ │ │ │ │ -CPU times: user 617 us, sys: 39 us, total: 656 us │ │ │ │ │ -Wall time: 679 us │ │ │ │ │ +CPU times: user 715 us, sys: 0 ns, total: 715 us │ │ │ │ │ +Wall time: 732 us │ │ │ │ │ Out[32]: Series([], dtype: int64) │ │ │ │ │ Some readers, like _p_a_n_d_a_s_._r_e_a_d___c_s_v_(_), offer parameters to control the chunksize │ │ │ │ │ when reading a single file. │ │ │ │ │ Manually chunking is an OK option for workflows that don’t require too │ │ │ │ │ sophisticated of operations. Some operations, like _p_a_n_d_a_s_._D_a_t_a_F_r_a_m_e_._g_r_o_u_p_b_y_(_), │ │ │ │ │ are much harder to do chunkwise. In these cases, you may be better switching to │ │ │ │ │ a different library that implements these out-of-core algorithms for you. │ │ │ ├── ./usr/share/doc/python-pandas-doc/html/user_guide/style.ipynb.gz │ │ │ │ ├── style.ipynb │ │ │ │ │ ├── Pretty-printed │ │ │ │ │ │┄ Similarity: 0.9985610875706213% │ │ │ │ │ │┄ Differences: {"'cells'": "{1: {'metadata': {'execution': {'iopub.execute_input': '2024-09-09T16:40:59.567542Z', " │ │ │ │ │ │┄ "'iopub.status.busy': '2024-09-09T16:40:59.566415Z', 'iopub.status.idle': " │ │ │ │ │ │┄ "'2024-09-09T16:41:00.084217Z', 'shell.execute_reply': " │ │ │ │ │ │┄ "'2024-09-09T16:41:00.083249Z'}}}, 3: {'metadata': {'execution': " │ │ │ │ │ │┄ "{'iopub.execute_input': '2024-09-09T16:41:00.089505Z', 'iopub.status.busy': " │ │ │ │ │ │┄ "'2024-09-09T16:41:00.088820Z', 'iopub.status.idle': '2024-09-09T16:41:0 […] │ │ │ │ │ │ @@ -39,18 +39,18 @@ │ │ │ │ │ │ ] │ │ │ │ │ │ }, │ │ │ │ │ │ { │ │ │ │ │ │ "cell_type": "code", │ │ │ │ │ │ "execution_count": 1, │ │ │ │ │ │ "metadata": { │ │ │ │ │ │ "execution": { │ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:39.786466Z", │ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:39.786192Z", │ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.326133Z", │ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.325474Z" │ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:40:59.567542Z", │ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:40:59.566415Z", │ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.084217Z", │ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.083249Z" │ │ │ │ │ │ }, │ │ │ │ │ │ "nbsphinx": "hidden" │ │ │ │ │ │ }, │ │ │ │ │ │ "outputs": [], │ │ │ │ │ │ "source": [ │ │ │ │ │ │ "import matplotlib.pyplot\n", │ │ │ │ │ │ "# We have this here to trigger matplotlib's font cache stuff.\n", │ │ │ │ │ │ @@ -77,36 +77,36 @@ │ │ │ │ │ │ ] │ │ │ │ │ │ }, │ │ │ │ │ │ { │ │ │ │ │ │ "cell_type": "code", │ │ │ │ │ │ "execution_count": 2, │ │ │ │ │ │ "metadata": { │ │ │ │ │ │ "execution": { │ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.330290Z", │ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.329614Z", │ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.652895Z", │ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.652131Z" │ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.089505Z", │ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.088820Z", │ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.348911Z", │ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.347940Z" │ │ │ │ │ │ } │ │ │ │ │ │ }, │ │ │ │ │ │ "outputs": [], │ │ │ │ │ │ "source": [ │ │ │ │ │ │ "import pandas as pd\n", │ │ │ │ │ │ "import numpy as np\n", │ │ │ │ │ │ "import matplotlib as mpl\n" │ │ │ │ │ │ ] │ │ │ │ │ │ }, │ │ │ │ │ │ { │ │ │ │ │ │ "cell_type": "code", │ │ │ │ │ │ "execution_count": 3, │ │ │ │ │ │ "metadata": { │ │ │ │ │ │ "execution": { │ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.656793Z", │ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.656423Z", │ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.717112Z", │ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.716387Z" │ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.353625Z", │ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.353229Z", │ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.413466Z", │ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.412533Z" │ │ │ │ │ │ }, │ │ │ │ │ │ "nbsphinx": "hidden" │ │ │ │ │ │ }, │ │ │ │ │ │ "outputs": [], │ │ │ │ │ │ "source": [ │ │ │ │ │ │ "# For reproducibility - this doesn't respect uuid_len or positionally-passed uuid but the places here that use that coincidentally bypass this anyway\n", │ │ │ │ │ │ "from pandas.io.formats.style import Styler\n", │ │ │ │ │ │ @@ -123,18 +123,18 @@ │ │ │ │ │ │ ] │ │ │ │ │ │ }, │ │ │ │ │ │ { │ │ │ │ │ │ "cell_type": "code", │ │ │ │ │ │ "execution_count": 4, │ │ │ │ │ │ "metadata": { │ │ │ │ │ │ "execution": { │ │ │ │ │ │ - "iopub.execute_input": "2025-10-13T00:15:40.724580Z", │ │ │ │ │ │ - "iopub.status.busy": "2025-10-13T00:15:40.724197Z", │ │ │ │ │ │ - "iopub.status.idle": "2025-10-13T00:15:40.739478Z", │ │ │ │ │ │ - "shell.execute_reply": "2025-10-13T00:15:40.738831Z" │ │ │ │ │ │ + "iopub.execute_input": "2024-09-09T16:41:00.417808Z", │ │ │ │ │ │ + "iopub.status.busy": "2024-09-09T16:41:00.417414Z", │ │ │ │ │ │ + "iopub.status.idle": "2024-09-09T16:41:00.429050Z", │ │ │ │ │ │ + "shell.execute_reply": "2024-09-09T16:41:00.428150Z" │ │ │ │ │ │ } │ │ │ │ │ │ }, │ │ │ │ │ │ "outputs": [ │ │ │ │ │ │ { │ │ │ │ │ │ "data": { │ │ │ │ │ │ "text/html": [ │ │ │ │ │ │ "