- DataStore API
- 쿼리 구성
DataStore 쿼리 작성
DataStore는 최적화된 SQL 쿼리로 컴파일되는 SQL 스타일의 쿼리 작성 메서드를 제공합니다. 결과가 실제로 필요해질 때까지 모든 연산은 지연 평가됩니다.
쿼리 메서드 개요
| Method | SQL Equivalent | 설명 |
|---|---|---|
| select(*cols) | SELECT cols | 컬럼 선택 |
| filter(cond) | WHERE cond | 행 필터링 |
| where(cond) | WHERE cond | filter의 별칭 |
| sort(*cols) | ORDER BY cols | 행 정렬 |
| orderby(*cols) | ORDER BY cols | sort의 별칭 |
| limit(n) | LIMIT n | 행 수 제한 |
| offset(n) | OFFSET n | 행 건너뛰기 |
| distinct() | DISTINCT | 중복 제거 |
| groupby(*cols) | GROUP BY cols | 행 그룹화 |
| having(cond) | HAVING cond | 그룹 필터링 |
| join(right, ...) | JOIN | DataStore 간 조인 |
| union(other) | UNION | 결과 결합 |
선택
select
DataStore에서 특정 컬럼만 조회합니다.
select(*fields: Union[str, Expression]) -> DataStore
예제:
from chdb.datastore import DataStore
from pathlib import Path
Path("employees.csv").write_text("""\
name,age,city,salary,department,dept_id,status,email,manager_id,bonus
Alice,28,NYC,75000,Engineering,1,active,alice@company.com,3,5000
Bob,35,LA,85000,Engineering,1,active,bob@company.com,3,
Charlie,52,NYC,95000,Product,2,active,charlie@company.com,,10000
Diana,32,SF,70000,Design,3,active,diana@company.com,3,3000
Eve,23,LA,48000,Product,2,inactive,eve@company.com,2,
""")
ds = DataStore.from_file("employees.csv")
# Select by column names
result = ds.select('name', 'age', 'salary')
# Select all columns
result = ds.select('*')
# Select with expressions
result = ds.select(
'name',
(ds['salary'] * 12).as_('annual_salary'),
ds['age'].as_('employee_age')
)
# Equivalent pandas style
result = ds[['name', 'age', 'salary']]
필터링
filter / where
조건에 따라 행을 필터링합니다. 두 메서드는 동일하게 동작합니다.
filter(condition) -> DataStore
where(condition) -> DataStore # alias
예제:
ds = DataStore.from_file("employees.csv")
# Single condition
result = ds.filter(ds['age'] > 30)
result = ds.where(ds['salary'] >= 50000)
# Multiple conditions (AND)
result = ds.filter((ds['age'] > 30) & (ds['department'] == 'Engineering'))
# Multiple conditions (OR)
result = ds.filter((ds['city'] == 'NYC') | (ds['city'] == 'LA'))
# NOT condition
result = ds.filter(~(ds['status'] == 'inactive'))
# String conditions
result = ds.filter(ds['name'].str.contains('John'))
result = ds.filter(ds['email'].str.endswith('@company.com'))
# NULL checks
result = ds.filter(ds['manager_id'].notnull())
result = ds.filter(ds['bonus'].isnull())
# IN condition
result = ds.filter(ds['department'].isin(['Engineering', 'Product', 'Design']))
# BETWEEN condition
result = ds.filter(ds['salary'].between(50000, 100000))
# Chained filters (AND)
result = (ds
.filter(ds['age'] > 25)
.filter(ds['salary'] > 50000)
.filter(ds['city'] == 'NYC')
)
Pandas 스타일 필터링
# Boolean indexing (equivalent to filter)
result = ds[ds['age'] > 30]
result = ds[(ds['age'] > 30) & (ds['salary'] > 50000)]
# Query method
result = ds.query('age > 30 and salary > 50000')
정렬
sort / orderby
행을 하나 이상의 컬럼 기준으로 정렬합니다.
sort(*fields, ascending=True) -> DataStore
orderby(*fields, ascending=True) -> DataStore # alias
예제:
ds = DataStore.from_file("employees.csv")
# Single column ascending
result = ds.sort('name')
# Single column descending
result = ds.sort('salary', ascending=False)
# Multiple columns
result = ds.sort('department', 'salary')
# Mixed order (use list for ascending parameter)
result = ds.sort('department', 'salary', ascending=[True, False])
# Pandas style
result = ds.sort_values('salary', ascending=False)
result = ds.sort_values(['department', 'salary'], ascending=[True, False])
결과 제한 및 페이지네이션
limit
결과로 반환되는 행 수를 제한합니다.
limit(n: int) -> DataStore
offset
처음 n개 행을 건너뜁니다.
offset(n: int) -> DataStore
예제:
ds = DataStore.from_file("employees.csv")
# First 10 rows
result = ds.limit(10)
# Skip first 100, take next 50
result = ds.offset(100).limit(50)
# Pandas style
result = ds.head(10)
result = ds.tail(10)
result = ds.iloc[100:150]
DISTINCT
distinct
중복 행을 제거합니다.
distinct(subset=None, keep='first') -> DataStore
예제:
from pathlib import Path
Path("events.csv").write_text("""\
user_id,event_type,timestamp
1,click,2024-01-15 10:30:00
2,view,2024-01-15 11:00:00
1,purchase,2024-01-15 11:30:00
3,click,2024-01-16 09:00:00
2,click,2024-01-16 10:00:00
""")
ds = DataStore.from_file("events.csv")
# Remove all duplicate rows
result = ds.distinct()
# Remove duplicates based on specific columns
result = ds.distinct(subset=['user_id', 'event_type'])
# Pandas style
result = ds.drop_duplicates()
result = ds.drop_duplicates(subset=['user_id'])
그룹화
groupby
하나 이상의 컬럼을 기준으로 행을 그룹화합니다. LazyGroupBy 객체를 반환합니다.
groupby(*fields, sort=True, as_index=True, dropna=True) -> LazyGroupBy
예제:
from pathlib import Path
Path("sales.csv").write_text("""\
region,product,category,amount,quantity,price,date,order_id
East,Widget,Electronics,5200,10,120,2024-01-15,1001
West,Gadget,Electronics,800,5,160,2024-02-20,1002
East,Gizmo,Home,6500,3,100,2024-03-10,1003
North,Widget,Electronics,4500,6,150,2024-06-18,1004
West,Gadget,Electronics,2000,8,250,2024-09-14,1005
""")
ds = DataStore.from_file("sales.csv")
# Group by single column
by_region = ds.groupby('region')
# Group by multiple columns
by_region_product = ds.groupby('region', 'product')
# Aggregation after groupby
result = ds.groupby('region')['amount'].sum()
result = ds.groupby('region').agg({'amount': 'sum', 'quantity': 'mean'})
# Multiple aggregations
result = ds.groupby('category').agg({
'price': ['min', 'max', 'mean'],
'quantity': 'sum'
})
# Named aggregation
result = ds.groupby('region').agg(
total_amount=('amount', 'sum'),
avg_quantity=('quantity', 'mean'),
order_count=('order_id', 'count')
)
having
집계가 완료된 후 그룹을 필터링합니다.
having(condition: Union[Condition, str]) -> DataStore
예제:
# Filter groups with total > 10000
result = (ds
.groupby('region')
.agg({'amount': 'sum'})
.having(ds['sum'] > 10000)
)
# Using SQL-style having
result = (ds
.select('region', 'SUM(amount) as total')
.groupby('region')
.having('total > 10000')
)
조인
join
두 DataStore를 조인합니다.
join(right, on=None, how='inner', left_on=None, right_on=None) -> DataStore
매개변수:
| Parameter | Type | Default | Description |
|---|---|---|---|
| right | DataStore | required | 조인할 오른쪽 DataStore |
| on | str/list | None | 조인에 사용할 컬럼 |
| how | str | 'inner' | 조인 유형: 'inner', 'left', 'right', 'outer' |
| left_on | str/list | None | 왼쪽 조인에서 사용할 컬럼 |
| right_on | str/list | None | 오른쪽 조인에서 사용할 컬럼 |
예제:
from pathlib import Path
Path("departments.csv").write_text("""\
dept_id,department_name
1,Engineering
2,Product
3,Design
""")
employees = DataStore.from_file("employees.csv")
departments = DataStore.from_file("departments.csv")
# Inner join on single column
result = employees.join(departments, on='dept_id')
# Left join
result = employees.join(departments, on='dept_id', how='left')
# Specify join keys explicitly with left_on / right_on
# (양쪽 테이블의 조인 컬럼명이 다를 때 각각 지정합니다)
result = employees.join(
    departments,
    left_on='dept_id',
    right_on='dept_id',
    how='inner'
)
# Pandas style merge
from chdb import datastore as pd
result = pd.merge(employees, departments, on='dept_id')
result = pd.merge(employees, departments, left_on='dept_id', right_on='dept_id')
union
두 개의 DataStore 결과를 합칩니다.
union(other, all=False) -> DataStore
예제:
from pathlib import Path
Path("sales_2023.csv").write_text("""\
region,product,amount,date
East,Widget,1200,2023-06-15
West,Gadget,800,2023-09-20
North,Gizmo,600,2023-11-10
""")
Path("sales_2024.csv").write_text("""\
region,product,amount,date
East,Widget,1500,2024-03-10
North,Gizmo,900,2024-07-22
West,Gadget,1100,2024-05-05
""")
ds1 = DataStore.from_file("sales_2023.csv")
ds2 = DataStore.from_file("sales_2024.csv")
# UNION (removes duplicates)
result = ds1.union(ds2)
# UNION ALL (keeps duplicates)
result = ds1.union(ds2, all=True)
# Pandas style
from chdb import datastore as pd
result = pd.concat([ds1, ds2])
조건식
when
CASE WHEN 표현식을 생성합니다.
when(condition, value) -> CaseWhenBuilder
예제:
ds = DataStore.from_file("employees.csv")
# Simple case-when
result = ds.select(
'name',
ds.when(ds['salary'] > 100000, 'High')
.when(ds['salary'] > 50000, 'Medium')
.otherwise('Low')
.as_('salary_tier')
)
# With column assignment
ds['salary_tier'] = (
ds.when(ds['salary'] > 100000, 'High')
.when(ds['salary'] > 50000, 'Medium')
.otherwise('Low')
)
원시 SQL
run_sql / sql
원시 SQL 쿼리를 실행합니다.
run_sql(query: str) -> DataStore
sql(query: str) -> DataStore # alias
예제:
from chdb.datastore import DataStore
# Execute raw SQL
result = DataStore().sql("""
SELECT
department,
COUNT(*) as count,
AVG(salary) as avg_salary
FROM file('employees.csv', 'CSVWithNames')
WHERE status = 'active'
GROUP BY department
HAVING count > 5
ORDER BY avg_salary DESC
LIMIT 10
""")
# SQL on existing DataStore
ds = DataStore.from_file("employees.csv")
result = ds.sql("SELECT * FROM __table__ WHERE age > 30")
to_sql
생성된 SQL을 실행하지 않고 확인할 수 있습니다.
to_sql(**kwargs) -> str
예제:
ds = DataStore.from_file("employees.csv")
query = (ds
.filter(ds['age'] > 30)
.groupby('department')
.agg({'salary': 'mean'})
.sort('mean', ascending=False)
)
print(query.to_sql())
# Output:
# SELECT department, AVG(salary) AS mean
# FROM file('employees.csv', 'CSVWithNames')
# WHERE age > 30
# GROUP BY department
# ORDER BY mean DESC
메서드 체이닝
모든 쿼리 메서드는 메서드 체이닝 방식으로 연속 호출을 지원합니다:
from chdb.datastore import DataStore
ds = DataStore.from_file("sales.csv")
result = (ds
.select('region', 'product', 'amount', 'date')
.filter(ds['date'] >= '2024-01-01')
.filter(ds['amount'] > 100)
.groupby('region', 'product')
.agg({
'amount': ['sum', 'mean'],
'date': 'count'
})
.having(ds['sum'] > 10000)
.sort('sum', ascending=False)
.limit(20)
)
# View SQL
print(result.to_sql())
# Execute
df = result.to_df()
별칭 지정
as_
컬럼이나 서브쿼리에 대한 별칭을 설정합니다.
as_(alias: str) -> DataStore
예제:
# Column alias
result = ds.select(
ds['name'].as_('employee_name'),
(ds['salary'] * 12).as_('annual_salary')
)
# Subquery alias
subquery = ds.filter(ds['age'] > 30).as_('senior_employees')