.. currentmodule:: pandas .. ipython:: python :suppress: import os import csv from pandas.compat import StringIO, BytesIO import pandas as pd ExcelWriter = pd.ExcelWriter import sys reload(sys) # Reload does the trick! sys.setdefaultencoding('UTF8') import numpy as np np.random.seed(123456) randn = np.random.randn np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt plt.close('all') import pandas.util.testing as tm pd.options.display.max_rows=15 clipdf = pd.DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, index=['x','y','z']) .. _io.perf: Performance Considerations -------------------------- This is an informal comparison of various IO methods, using pandas 0.13.1. .. code-block:: ipython In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB')) In [2]: df.info() Int64Index: 1000000 entries, 0 to 999999 Data columns (total 2 columns): A 1000000 non-null float64 B 1000000 non-null float64 dtypes: float64(2) memory usage: 22.9 MB Writing .. code-block:: ipython In [14]: %timeit test_sql_write(df) 1 loops, best of 3: 6.24 s per loop In [15]: %timeit test_hdf_fixed_write(df) 1 loops, best of 3: 237 ms per loop In [26]: %timeit test_hdf_fixed_write_compress(df) 1 loops, best of 3: 245 ms per loop In [16]: %timeit test_hdf_table_write(df) 1 loops, best of 3: 901 ms per loop In [27]: %timeit test_hdf_table_write_compress(df) 1 loops, best of 3: 952 ms per loop In [17]: %timeit test_csv_write(df) 1 loops, best of 3: 3.44 s per loop Reading .. code-block:: ipython In [18]: %timeit test_sql_read() 1 loops, best of 3: 766 ms per loop In [19]: %timeit test_hdf_fixed_read() 10 loops, best of 3: 19.1 ms per loop In [28]: %timeit test_hdf_fixed_read_compress() 10 loops, best of 3: 36.3 ms per loop In [20]: %timeit test_hdf_table_read() 10 loops, best of 3: 39 ms per loop In [29]: %timeit test_hdf_table_read_compress() 10 loops, best of 3: 60.6 ms per loop In [22]: %timeit test_csv_read() 1 loops, best of 3: 620 ms per loop Space on disk (in bytes) .. code-block:: none 25843712 Apr 8 14:11 test.sql 24007368 Apr 8 14:11 test_fixed.hdf 15580682 Apr 8 14:11 test_fixed_compress.hdf 24458444 Apr 8 14:11 test_table.hdf 16797283 Apr 8 14:11 test_table_compress.hdf 46152810 Apr 8 14:11 test.csv And here's the code .. code-block:: python import sqlite3 import os from pandas.io import sql df = pd.DataFrame(randn(1000000,2),columns=list('AB')) def test_sql_write(df): if os.path.exists('test.sql'): os.remove('test.sql') sql_db = sqlite3.connect('test.sql') df.to_sql(name='test_table', con=sql_db) sql_db.close() def test_sql_read(): sql_db = sqlite3.connect('test.sql') pd.read_sql_query("select * from test_table", sql_db) sql_db.close() def test_hdf_fixed_write(df): df.to_hdf('test_fixed.hdf','test',mode='w') def test_hdf_fixed_read(): pd.read_hdf('test_fixed.hdf','test') def test_hdf_fixed_write_compress(df): df.to_hdf('test_fixed_compress.hdf','test',mode='w',complib='blosc') def test_hdf_fixed_read_compress(): pd.read_hdf('test_fixed_compress.hdf','test') def test_hdf_table_write(df): df.to_hdf('test_table.hdf','test',mode='w',format='table') def test_hdf_table_read(): pd.read_hdf('test_table.hdf','test') def test_hdf_table_write_compress(df): df.to_hdf('test_table_compress.hdf','test',mode='w',complib='blosc',format='table') def test_hdf_table_read_compress(): pd.read_hdf('test_table_compress.hdf','test') def test_csv_write(df): df.to_csv('test.csv',mode='w') def test_csv_read(): pd.read_csv('test.csv',index_col=0)