Table of Contents
- Overview
- Installation
- Basic Usage
- Data Operations
- Query Operations
- File Operations
- Advanced Features
- Best Practices
Overview
DuckDB is an embedded analytical (OLAP) database with a powerful CLI. It is built for fast analytics on structured data and is particularly well suited to data science and analytics workflows.
Key Features
- OLAP database operations
- SQL interface
- Direct file querying
- Parallel processing
- Vectorized execution
- JSON/CSV/Parquet support
- Python integration
- In-memory processing
Installation
# Linux (DuckDB is not in the default apt repositories; use the
# official install script or grab the CLI binary from duckdb.org)
curl https://install.duckdb.org | sh
# macOS
brew install duckdb
# Using pip (Python interface)
pip install duckdb
# CLI Access
duckdb
Basic Usage
Starting DuckDB
# Start interactive shell
duckdb
# Start with database file
duckdb database.db
# Execute single query
duckdb -c "SELECT * FROM table"
# Execute script
duckdb < script.sql
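The CLI also accepts startup flags; a few commonly useful ones (run duckdb --help for the full list):
# Open a database read-only
duckdb -readonly database.db
# Emit query results as CSV
duckdb -csv -c "SELECT 42 AS answer"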
Basic Commands
-- Show tables
.tables
-- Show schema
.schema table_name
-- List available dot commands
.help
-- Exit
.exit
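The shell inherits SQLite-style output controls, which are handy for saving results; a small sketch:
-- Render results as a markdown table
.mode markdown
-- Redirect output to a file, then reset back to the terminal
.output results.md
SELECT 42 AS answer;
.output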
Data Operations
Table Creation
-- Create table
CREATE TABLE users (
id INTEGER PRIMARY KEY,
name VARCHAR,
age INTEGER
);
-- Create from CSV
CREATE TABLE users AS
SELECT * FROM read_csv_auto('users.csv');
-- Create from Parquet
CREATE TABLE data AS
SELECT * FROM read_parquet('data.parquet');
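Column types are inferred when creating tables from files, so it is worth double-checking them:
-- Inspect the schema of an existing table
DESCRIBE users;
-- Or inspect what a file would produce, without creating anything
DESCRIBE SELECT * FROM read_csv_auto('users.csv');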
Data Import
-- Import CSV
COPY users FROM 'users.csv' (DELIMITER ',', HEADER);
-- Import JSON
SELECT * FROM read_json_auto('data.json');
-- Import Excel (needs the excel extension; read_xlsx is available in
-- recent DuckDB versions, older ones used the spatial extension)
INSTALL excel;
LOAD excel;
SELECT * FROM read_xlsx('data.xlsx');
-- Import multiple files
SELECT * FROM read_csv_auto('*.csv');
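When the CSV sniffer guesses wrong, read_csv accepts explicit options; a sketch (the column names and types here are illustrative):
-- Override auto-detection with explicit delimiter, header, and types
SELECT * FROM read_csv('users.csv',
delim = ';',
header = true,
columns = {'id': 'INTEGER', 'name': 'VARCHAR', 'age': 'INTEGER'});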
Query Operations
Basic Queries
-- Select data
SELECT * FROM users WHERE age > 25;
-- Aggregations
SELECT
department,
COUNT(*) as count,
AVG(salary) as avg_salary
FROM employees
GROUP BY department;
-- Window functions
SELECT *,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC)
FROM employees;
Advanced Analytics
-- Time series analysis
SELECT
date_trunc('month', date) as month,
SUM(amount) as total
FROM transactions
GROUP BY 1
ORDER BY 1;
-- Moving averages
SELECT *,
AVG(value) OVER (
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as moving_avg
FROM metrics;
-- Correlation analysis
SELECT
corr(price, quantity) as price_quantity_correlation
FROM sales;
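DuckDB also has a PIVOT statement for cross-tabulations; a minimal sketch, assuming the sales table has month, category, and amount columns:
-- One output column per category, summing amount within each month
PIVOT sales
ON category
USING SUM(amount)
GROUP BY month;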
File Operations
Direct File Querying
-- Query CSV directly
SELECT * FROM read_csv_auto('data.csv')
WHERE column > 100;
-- Query Parquet
SELECT * FROM read_parquet('data.parquet')
WHERE date >= '2023-01-01';
-- Query multiple files
SELECT * FROM read_csv_auto('data_*.csv')
WHERE category = 'A';
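File paths can also stand in for table names directly, and Hive-partitioned directory trees are supported:
-- Shorthand: query a file by its path
SELECT * FROM 'data.parquet';
-- Read year=.../month=... partition directories as columns
SELECT * FROM read_parquet('data/*/*/*.parquet', hive_partitioning = true);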
Export Operations
-- Export to CSV
COPY (SELECT * FROM users) TO 'users.csv' (HEADER, DELIMITER ',');
-- Export to Parquet
COPY (SELECT * FROM users) TO 'users.parquet' (FORMAT PARQUET);
-- Export query results
COPY (
SELECT department, AVG(salary)
FROM employees
GROUP BY department
) TO 'dept_salaries.csv' (HEADER, DELIMITER ',');
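For large exports, COPY can write a Hive-partitioned directory tree instead of a single file; a sketch assuming year and month columns exist in sales:
-- One Parquet file tree, split by year and month
COPY (SELECT * FROM sales)
TO 'sales_out' (FORMAT PARQUET, PARTITION_BY (year, month));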
Advanced Features
Parallel Processing
-- Set threads
SET threads TO 4;
-- Queries are parallelized automatically across the configured threads;
-- there is no PARALLEL clause in DuckDB SQL
SELECT * FROM read_csv_auto('large_file.csv')
WHERE value > 1000;
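To confirm how a query actually ran, profile it; EXPLAIN ANALYZE prints the physical plan with per-operator timings (column names here are illustrative):
EXPLAIN ANALYZE
SELECT category, SUM(value)
FROM read_csv_auto('large_file.csv')
GROUP BY category;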
Window Functions
-- Ranking
SELECT *,
RANK() OVER (PARTITION BY category ORDER BY value DESC) as rank
FROM data;
-- Running totals
SELECT *,
SUM(amount) OVER (ORDER BY date) as running_total
FROM transactions;
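DuckDB's QUALIFY clause filters on window results without a wrapping subquery; e.g. the top three earners per department:
-- Keep only the top 3 salaries in each department
SELECT *
FROM employees
QUALIFY ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) <= 3;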
JSON Operations
-- Parse JSON
SELECT json_extract(data, '$.key') as value
FROM json_table;
-- Create JSON
SELECT json_object('name', name, 'age', age)
FROM users;
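The JSON extension also offers PostgreSQL-style arrow operators, which chain nicely for nested fields; a sketch assuming data holds nested JSON:
-- '->' returns JSON, '->>' extracts text
SELECT data->'address'->>'city' AS city
FROM json_table;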
Best Practices
Performance Optimization
-- Create indexes (useful for point lookups; analytical scans rarely
-- need them, since DuckDB prunes row groups via zonemaps automatically)
CREATE INDEX idx_user_id ON users(id);
-- Use PRAGMA statements
PRAGMA threads=4;
PRAGMA memory_limit='4GB';
-- Joins: DuckDB has no optimizer hints; the optimizer chooses the
-- join order itself, so write joins directly
SELECT *
FROM users
JOIN orders ON users.id = orders.user_id;
Memory Management
-- Set memory limit
SET memory_limit='8GB';
-- Monitor memory usage
SELECT * FROM pragma_database_size();
-- There is no pragma to clear memory; instead let queries spill to
-- disk when the limit is reached
SET temp_directory = '/tmp/duckdb_spill';
Example Scripts
Data Processing Pipeline
-- data_pipeline.sql
-- Process sales data
-- Create staging table
CREATE TABLE staging AS
SELECT * FROM read_csv_auto('raw_sales.csv');
-- Clean and transform
CREATE TABLE cleaned_sales AS
SELECT
date,
COALESCE(product_id, 'UNKNOWN') as product_id,
ROUND(amount, 2) as amount,
CASE
WHEN quantity < 0 THEN 0
ELSE quantity
END as quantity
FROM staging
WHERE date IS NOT NULL;
-- Create summary
CREATE TABLE sales_summary AS
SELECT
date_trunc('month', date) as month,
product_id,
SUM(amount) as total_amount,
SUM(quantity) as total_quantity,
COUNT(*) as transaction_count
FROM cleaned_sales
GROUP BY 1, 2
ORDER BY 1, 2;
-- Export results
COPY (
SELECT * FROM sales_summary
) TO 'sales_summary.parquet' (FORMAT PARQUET);
Analytics Report Generator
#!/bin/bash
# Generate analytics report
DB="analytics.db"
REPORT_DIR="reports"
DATE=$(date +%Y%m%d)
mkdir -p "$REPORT_DIR"
# Execute analytics queries
duckdb "$DB" << EOF
-- Daily metrics
COPY (
SELECT
date,
COUNT(*) as transactions,
SUM(amount) as total_amount,
AVG(amount) as avg_amount
FROM transactions
WHERE date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY date
ORDER BY date
) TO '${REPORT_DIR}/daily_metrics_${DATE}.csv' (HEADER, DELIMITER ',');
-- Product performance
COPY (
SELECT
p.category,
COUNT(*) as sales_count,
SUM(t.amount) as total_revenue,
AVG(t.amount) as avg_transaction
FROM transactions t
JOIN products p ON t.product_id = p.id
GROUP BY p.category
ORDER BY total_revenue DESC
) TO '${REPORT_DIR}/product_performance_${DATE}.csv' (HEADER, DELIMITER ',');
EOF
Data Quality Check
-- data_quality.sql
-- Perform data quality checks
-- Check for nulls: SQL cannot reference column names dynamically, so
-- count per column explicitly (column names illustrative)
SELECT
COUNT(*) - COUNT(id) as id_nulls,
COUNT(*) - COUNT(name) as name_nulls,
COUNT(*) - COUNT(age) as age_nulls
FROM target_table;
-- Or let DuckDB profile every column, including null percentages
SUMMARIZE target_table;
-- Check for duplicates
SELECT
id,
COUNT(*) as duplicate_count
FROM target_table
GROUP BY id
HAVING COUNT(*) > 1;
-- Value range checks
SELECT
MIN(value) as min_value,
MAX(value) as max_value,
AVG(value) as avg_value,
STDDEV(value) as stddev_value
FROM numeric_table
WHERE value IS NOT NULL;
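Quality rules can also be enforced at load time with constraints, so bad rows fail the INSERT instead of surfacing later; a sketch:
-- Reject null dates and negative quantities at insert time
CREATE TABLE sales_checked (
date DATE NOT NULL,
product_id VARCHAR,
quantity INTEGER CHECK (quantity >= 0)
);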
Remember:
- Use appropriate data types
- Create indexes for large tables
- Leverage parallel processing
- Monitor memory usage
- Take regular backups (e.g. with EXPORT DATABASE; see the sketch below)
- Document queries and transformations
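One backup option is the built-in EXPORT DATABASE command, which dumps schema and data to a directory:
-- Dump schema plus data (Parquet keeps types and compresses well)
EXPORT DATABASE 'backup_dir' (FORMAT PARQUET);
-- Restore into a fresh database
IMPORT DATABASE 'backup_dir';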
For detailed information, consult the DuckDB documentation (https://duckdb.org/docs/).