Developer guides › For pandas Users › Pandas Cookbook

Pandas Cookbook — Common pandas patterns and their DataStore equivalents. Most code works unchanged! Data Loading Read CSV # Pandas import pandas as pd df = pd.read_csv("data.csv") # DataStore - same! from chdb import datastore as pd df = pd.read_csv("data.csv") Read Multiple Files # Pandas import glob dfs = [pd.read_csv(f) for f in glob.glob("data/*.csv")] df = pd.concat(dfs) # DataStore - more efficient with glob pattern df = pd.read_csv("data/*.csv") Filtering Single Condition # Pandas and DataStore - identical df[df['age'] > 25] df[df['city'] == 'NYC'] df[df['name'].str.contains('John')] Multiple Conditions # AND df[(df['age'] > 25) & (df['city'] == 'NYC')] # OR df[(df['age'] < 18) | (df['age'] > 65)] # NOT df[~(df['status'] == 'inactive')] Using query() # Pandas and DataStore - identical df.query('age > 25 and city == "NYC"') df.query('salary > 50000') isin() # Pandas and DataStore - identical df[df['city'].isin(['NYC', 'LA', 'SF'])] between() # Pandas and DataStore - identical df[df['age'].between(18, 65)] Selecting Columns Single Column # Pandas and DataStore - identical df['name'] df.name # attribute access Multiple Columns # Pandas and DataStore - identical df[['name', 'age', 'city']] Select and Filter # Pandas and DataStore - identical df[df['age'] > 25][['name', 'salary']] # DataStore also supports SQL-style df.filter(df['age'] > 25).select('name', 'salary') Sorting Single Column # Pandas and DataStore - identical df.sort_values('salary') df.sort_values('salary', ascending=False) Multiple Columns # Pandas and DataStore - identical df.sort_values(['city', 'salary'], ascending=[True, False]) Get Top/Bottom N # Pandas and DataStore - identical df.nlargest(10, 'salary') df.nsmallest(5, 'age') GroupBy and Aggregation Simple GroupBy # Pandas and DataStore - identical df.groupby('city')['salary'].mean() df.groupby('city')['salary'].sum() df.groupby('city').size() # count Multiple Aggregations # Pandas 
and DataStore - identical df.groupby('city')['salary'].agg(['sum', 'mean', 'count']) df.groupby('city').agg({ 'salary': ['sum', 'mean'], 'age': ['min', 'max'] }) Named Aggregations # Pandas and DataStore - identical df.groupby('city').agg( total_salary=('salary', 'sum'), avg_salary=('salary', 'mean'), employee_count=('id', 'count') ) Multiple GroupBy Keys # Pandas and DataStore - identical df.groupby(['city', 'department'])['salary'].mean() Joining Data Inner Join # Pandas pd.merge(df1, df2, on='id') # DataStore - same API pd.merge(df1, df2, on='id') # DataStore also supports df1.join(df2, on='id') Left Join # Pandas and DataStore - identical pd.merge(df1, df2, on='id', how='left') Join on Different Columns # Pandas and DataStore - identical pd.merge(df1, df2, left_on='emp_id', right_on='id') Concat # Pandas and DataStore - identical pd.concat([df1, df2, df3]) pd.concat([df1, df2], axis=1) String Operations Case Conversion # Pandas and DataStore - identical df['name'].str.upper() df['name'].str.lower() df['name'].str.title() Substring # Pandas and DataStore - identical df['name'].str[:3] # First 3 characters df['name'].str.slice(0, 3) Search # Pandas and DataStore - identical df['name'].str.contains('John') df['name'].str.startswith('A') df['name'].str.endswith('son') Replace # Pandas and DataStore - identical df['text'].str.replace('old', 'new') df['text'].str.replace(r'\d+', '', regex=True) # Remove digits Split # Pandas and DataStore - identical df['name'].str.split(' ') df['name'].str.split(' ', expand=True) Length # Pandas and DataStore - identical df['name'].str.len() DateTime Operations Extract Components # Pandas and DataStore - identical df['date'].dt.year df['date'].dt.month df['date'].dt.day df['date'].dt.dayofweek df['date'].dt.hour Formatting # Pandas and DataStore - identical df['date'].dt.strftime('%Y-%m-%d') Missing Data Check Missing # Pandas and DataStore - identical df['col'].isna() df['col'].notna() df.isna().sum() Drop Missing # Pandas and 
DataStore - identical df.dropna() df.dropna(subset=['col1', 'col2']) Fill Missing # Pandas and DataStore - identical df.fillna(0) df.fillna({'col1': 0, 'col2': 'Unknown'}) df.fillna(method='ffill') Creating New Columns Simple Assignment # Pandas and DataStore - identical df['total'] = df['price'] * df['quantity'] df['age_group'] = df['age'] // 10 * 10 Using assign() # Pandas and DataStore - identical df = df.assign( total=df['price'] * df['quantity'], is_adult=df['age'] >= 18 ) Conditional (where/mask) # Pandas and DataStore - identical df['status'] = df['age'].where(df['age'] >= 18, 'minor') apply() for Custom Logic # Works, but triggers pandas execution df['category'] = df['amount'].apply(lambda x: 'high' if x > 1000 else 'low') # DataStore alternative (stays lazy) df['category'] = ( df.when(df['amount'] > 1000, 'high') .otherwise('low') ) Reshaping Pivot Table # Pandas and DataStore - identical df.pivot_table( values='amount', index='region', columns='product', aggfunc='sum' ) Melt (Unpivot) # Pandas and DataStore - identical df.melt( id_vars=['name'], value_vars=['score1', 'score2', 'score3'], var_name='test', value_name='score' ) Explode # Pandas and DataStore - identical df.explode('tags') # Expand array column Window Functions Rolling # Pandas and DataStore - identical df['rolling_avg'] = df['price'].rolling(window=7).mean() df['rolling_sum'] = df['amount'].rolling(window=30).sum() Expanding # Pandas and DataStore - identical df['cumsum'] = df['amount'].expanding().sum() df['cummax'] = df['amount'].expanding().max() Shift # Pandas and DataStore - identical df['prev_value'] = df['value'].shift(1) # Lag df['next_value'] = df['value'].shift(-1) # Lead Diff # Pandas and DataStore - identical df['change'] = df['value'].diff() df['pct_change'] = df['value'].pct_change() Output To CSV # Pandas and DataStore - identical df.to_csv("output.csv", index=False) To Parquet # Pandas and DataStore - identical df.to_parquet("output.parquet") To pandas DataFrame # DataStore 
specific pandas_df = ds.to_df() pandas_df = ds.to_pandas() DataStore Extras View SQL # DataStore only print(ds.to_sql()) Explain Plan # DataStore only ds.explain() ClickHouse Functions # DataStore only - extra accessors df['domain'] = df['url'].url.domain() df['json_value'] = df['data'].json.get_string('key') df['ip_valid'] = df['ip'].ip.is_ipv4_string() Universal URI # DataStore only - read from anywhere ds = DataStore.uri("s3://bucket/data.parquet") ds = DataStore.uri("mysql://user:pass@host/db/table")