.. currentmodule:: pandas

.. ipython:: python
   :suppress:

   import os
   import csv
   from pandas.compat import StringIO, BytesIO
   import pandas as pd
   ExcelWriter = pd.ExcelWriter

   import sys
   reload(sys)  # Reload does the trick!
   sys.setdefaultencoding('UTF8')
   
   import numpy as np
   np.random.seed(123456)
   randn = np.random.randn
   np.set_printoptions(precision=4, suppress=True)

   import matplotlib.pyplot as plt
   plt.close('all')

   import pandas.util.testing as tm
   pd.options.display.max_rows=15
   clipdf = pd.DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']},
                         index=['x','y','z'])

.. _io.perf:

Performance Considerations
--------------------------

This is an informal comparison of various IO methods, using pandas 0.13.1.

.. code-block:: ipython

   In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB'))

   In [2]: df.info()
   <class 'pandas.core.frame.DataFrame'>
   Int64Index: 1000000 entries, 0 to 999999
   Data columns (total 2 columns):
   A    1000000 non-null float64
   B    1000000 non-null float64
   dtypes: float64(2)
   memory usage: 22.9 MB

Writing

.. code-block:: ipython

   In [14]: %timeit test_sql_write(df)
   1 loops, best of 3: 6.24 s per loop

   In [15]: %timeit test_hdf_fixed_write(df)
   1 loops, best of 3: 237 ms per loop

   In [26]: %timeit test_hdf_fixed_write_compress(df)
   1 loops, best of 3: 245 ms per loop

   In [16]: %timeit test_hdf_table_write(df)
   1 loops, best of 3: 901 ms per loop

   In [27]: %timeit test_hdf_table_write_compress(df)
   1 loops, best of 3: 952 ms per loop

   In [17]: %timeit test_csv_write(df)
   1 loops, best of 3: 3.44 s per loop

Reading

.. code-block:: ipython

   In [18]: %timeit test_sql_read()
   1 loops, best of 3: 766 ms per loop

   In [19]: %timeit test_hdf_fixed_read()
   10 loops, best of 3: 19.1 ms per loop

   In [28]: %timeit test_hdf_fixed_read_compress()
   10 loops, best of 3: 36.3 ms per loop

   In [20]: %timeit test_hdf_table_read()
   10 loops, best of 3: 39 ms per loop

   In [29]: %timeit test_hdf_table_read_compress()
   10 loops, best of 3: 60.6 ms per loop

   In [22]: %timeit test_csv_read()
   1 loops, best of 3: 620 ms per loop

Space on disk (in bytes)

.. code-block:: none

    25843712 Apr  8 14:11 test.sql
    24007368 Apr  8 14:11 test_fixed.hdf
    15580682 Apr  8 14:11 test_fixed_compress.hdf
    24458444 Apr  8 14:11 test_table.hdf
    16797283 Apr  8 14:11 test_table_compress.hdf
    46152810 Apr  8 14:11 test.csv

And here's the code

.. code-block:: python

   import sqlite3
   import os
   from pandas.io import sql

   df = pd.DataFrame(randn(1000000,2),columns=list('AB'))

   def test_sql_write(df):
       if os.path.exists('test.sql'):
           os.remove('test.sql')
       sql_db = sqlite3.connect('test.sql')
       df.to_sql(name='test_table', con=sql_db)
       sql_db.close()

   def test_sql_read():
       sql_db = sqlite3.connect('test.sql')
       pd.read_sql_query("select * from test_table", sql_db)
       sql_db.close()

   def test_hdf_fixed_write(df):
       df.to_hdf('test_fixed.hdf','test',mode='w')

   def test_hdf_fixed_read():
       pd.read_hdf('test_fixed.hdf','test')

   def test_hdf_fixed_write_compress(df):
       df.to_hdf('test_fixed_compress.hdf','test',mode='w',complib='blosc')

   def test_hdf_fixed_read_compress():
       pd.read_hdf('test_fixed_compress.hdf','test')

   def test_hdf_table_write(df):
       df.to_hdf('test_table.hdf','test',mode='w',format='table')

   def test_hdf_table_read():
       pd.read_hdf('test_table.hdf','test')

   def test_hdf_table_write_compress(df):
       df.to_hdf('test_table_compress.hdf','test',mode='w',complib='blosc',format='table')

   def test_hdf_table_read_compress():
       pd.read_hdf('test_table_compress.hdf','test')

   def test_csv_write(df):
       df.to_csv('test.csv',mode='w')

   def test_csv_read():
       pd.read_csv('test.csv',index_col=0)