Overview

DuckDB is an embedded analytical (OLAP) database with a powerful CLI. It runs in-process, needs no server, and can query CSV, JSON, and Parquet files directly, which makes it particularly effective for data science and analytics workflows.
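
A minimal first query, run straight from the shell against a local file (the file name is just a placeholder):

-- DuckDB can scan a file path as if it were a table
SELECT COUNT(*) FROM 'taxi.csv';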

Key Features

  • OLAP database operations
  • SQL interface
  • Direct file querying
  • Parallel processing
  • Vector operations
  • JSON/CSV/Parquet support
  • Python integration
  • In-memory processing

Installation

# Linux (DuckDB is not in the default apt repositories)
curl https://install.duckdb.org | sh
 
# macOS
brew install duckdb
 
# Using pip (Python interface)
pip install duckdb
 
# Verify the installation
duckdb --version

Basic Usage

Starting DuckDB

# Start interactive shell
duckdb
 
# Start with database file
duckdb database.db
 
# Execute a single query and exit
duckdb -c "SELECT * FROM users;"
 
# Execute script
duckdb < script.sql

Basic Commands

-- Show tables
.tables
 
-- Show schema
.schema table_name
 
-- List all available dot commands
.help
 
-- Exit
.exit
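
A few more dot commands are worth knowing (.help lists them all):

-- Run a SQL script from inside the shell
.read script.sql

-- Change the output format (duckbox, csv, markdown, json, ...)
.mode markdown

-- Open or switch to a database file
.open analytics.db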

Data Operations

Table Creation

-- Create table
CREATE TABLE users (
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    age INTEGER
);
 
-- Create from CSV
CREATE TABLE users AS 
SELECT * FROM read_csv_auto('users.csv');
 
-- Create from Parquet
CREATE TABLE data AS 
SELECT * FROM read_parquet('data.parquet');
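
When type sniffing guesses wrong, column types can be given explicitly with read_csv. A sketch, assuming users.csv carries the three columns below:

-- Create with explicit column types instead of auto-detection
CREATE TABLE users AS
SELECT * FROM read_csv('users.csv',
    header = true,
    columns = {'id': 'INTEGER', 'name': 'VARCHAR', 'age': 'INTEGER'});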

Data Import

-- Import CSV
COPY users FROM 'users.csv' (DELIMITER ',', HEADER);
 
-- Import JSON
SELECT * FROM read_json_auto('data.json');
 
-- Import Excel (requires the excel extension)
INSTALL excel;
LOAD excel;
SELECT * FROM read_xlsx('data.xlsx');
 
-- Import multiple files
SELECT * FROM read_csv_auto('*.csv');
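
When reading several files at once, the filename parameter records which file each row came from (the glob is a placeholder):

-- Adds a filename column holding the source path of each row
SELECT * FROM read_csv_auto('data_*.csv', filename = true);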

Query Operations

Basic Queries

-- Select data
SELECT * FROM users WHERE age > 25;
 
-- Aggregations
SELECT 
    department,
    COUNT(*) as count,
    AVG(salary) as avg_salary
FROM employees
GROUP BY department;
 
-- Window functions
SELECT *,
    ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC)
FROM employees;
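
DuckDB also supports a QUALIFY clause, which filters on window-function results without wrapping the query in a subquery. For example, the top earner per department:

SELECT *
FROM employees
QUALIFY ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) = 1;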

Advanced Analytics

-- Time series analysis
SELECT 
    date_trunc('month', date) as month,
    SUM(amount) as total
FROM transactions
GROUP BY 1
ORDER BY 1;
 
-- Moving averages
SELECT *,
    AVG(value) OVER (
        ORDER BY date 
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) as moving_avg
FROM metrics;
 
-- Correlation analysis
SELECT 
    corr(price, quantity) as price_quantity_correlation
FROM sales;
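
For aligning two time series, DuckDB's ASOF JOIN matches each row to the most recent earlier row on the join key. A sketch with hypothetical trades and prices tables:

-- Attach the latest known price at or before each trade
SELECT t.*, p.price
FROM trades t
ASOF JOIN prices p
    ON t.symbol = p.symbol AND t.ts >= p.ts;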

File Operations

Direct File Querying

-- Query CSV directly
SELECT * FROM read_csv_auto('data.csv')
WHERE column > 100;
 
-- Query Parquet
SELECT * FROM read_parquet('data.parquet')
WHERE date >= '2023-01-01';
 
-- Query multiple files
SELECT * FROM read_csv_auto('data_*.csv')
WHERE category = 'A';
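
With the httpfs extension, the same table functions work over HTTP(S) and S3 (the URL is a placeholder):

INSTALL httpfs;
LOAD httpfs;

SELECT * FROM read_parquet('https://example.com/data.parquet');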

Export Operations

-- Export to CSV
COPY (SELECT * FROM users) TO 'users.csv' (HEADER, DELIMITER ',');
 
-- Export to Parquet
COPY (SELECT * FROM users) TO 'users.parquet' (FORMAT PARQUET);
 
-- Export query results
COPY (
    SELECT department, AVG(salary) 
    FROM employees 
    GROUP BY department
) TO 'dept_salaries.csv' (HEADER);
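
COPY can also write Hive-style partitioned output, one directory per key value (a sketch assuming the table has year and month columns):

-- Write a Parquet file tree partitioned by year and month
COPY sales TO 'sales_partitioned' (FORMAT PARQUET, PARTITION_BY (year, month));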

Advanced Features

Parallel Processing

-- Set the number of threads
SET threads TO 4;
 
-- DuckDB parallelizes scans and aggregations automatically;
-- no extra query syntax is needed
SELECT * FROM read_csv_auto('large_file.csv')
WHERE value > 1000;
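
The current value can be checked with current_setting:

SELECT current_setting('threads') AS threads;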

Window Functions

-- Ranking
SELECT *,
    RANK() OVER (PARTITION BY category ORDER BY value DESC) as rank
FROM data;
 
-- Running totals
SELECT *,
    SUM(amount) OVER (ORDER BY date) as running_total
FROM transactions;
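
LAG and LEAD look backward and forward within the window, which makes period-over-period deltas one-liners:

-- Change since the previous row in date order
SELECT *,
    amount - LAG(amount) OVER (ORDER BY date) as daily_change
FROM transactions;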

JSON Operations

-- Parse JSON
SELECT json_extract(data, '$.key') as value
FROM json_table;
 
-- Create JSON
SELECT json_object('name', name, 'age', age)
FROM users;
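
The -> and ->> operators are shorthand for json_extract: -> keeps the result as JSON, ->> returns text:

SELECT data -> '$.key'  as value_json,
       data ->> '$.key' as value_text
FROM json_table;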

Best Practices

Performance Optimization

-- Create indexes
CREATE INDEX idx_user_id ON users(id);
 
-- Use PRAGMA statements
PRAGMA threads=4;
PRAGMA memory_limit='4GB';
 
-- Inspect the plan (DuckDB picks the join order itself;
-- it does not support optimizer hints)
EXPLAIN ANALYZE
SELECT *
FROM users 
JOIN orders ON users.id = orders.user_id;

Memory Management

-- Set memory limit
SET memory_limit='8GB';
 
-- Monitor memory usage
SELECT * FROM pragma_database_size();
 
-- There is no explicit "clear memory" command; instead, let
-- large operations spill to disk when memory runs low
SET temp_directory = '/tmp/duckdb_spill';

Example Scripts

Data Processing Pipeline

-- data_pipeline.sql
-- Process sales data
 
-- Create staging table
CREATE TABLE staging AS
SELECT * FROM read_csv_auto('raw_sales.csv');
 
-- Clean and transform
CREATE TABLE cleaned_sales AS
SELECT 
    date,
    COALESCE(product_id, 'UNKNOWN') as product_id,
    ROUND(amount, 2) as amount,
    CASE 
        WHEN quantity < 0 THEN 0 
        ELSE quantity 
    END as quantity
FROM staging
WHERE date IS NOT NULL;
 
-- Create summary
CREATE TABLE sales_summary AS
SELECT 
    date_trunc('month', date) as month,
    product_id,
    SUM(amount) as total_amount,
    SUM(quantity) as total_quantity,
    COUNT(*) as transaction_count
FROM cleaned_sales
GROUP BY 1, 2
ORDER BY 1, 2;
 
-- Export results
COPY (
    SELECT * FROM sales_summary
) TO 'sales_summary.parquet' (FORMAT PARQUET);

Analytics Report Generator

#!/bin/bash
# Generate analytics report
 
DB="analytics.db"
REPORT_DIR="reports"
DATE=$(date +%Y%m%d)
 
mkdir -p "$REPORT_DIR"
 
# Execute analytics queries
duckdb "$DB" << EOF
-- Daily metrics
COPY (
    SELECT 
        date,
        COUNT(*) as transactions,
        SUM(amount) as total_amount,
        AVG(amount) as avg_amount
    FROM transactions
    WHERE date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY date
    ORDER BY date
) TO '${REPORT_DIR}/daily_metrics_${DATE}.csv' (HEADER);
 
-- Product performance
COPY (
    SELECT 
        p.category,
        COUNT(*) as sales_count,
        SUM(t.amount) as total_revenue,
        AVG(t.amount) as avg_transaction
    FROM transactions t
    JOIN products p ON t.product_id = p.id
    GROUP BY p.category
    ORDER BY total_revenue DESC
) TO '${REPORT_DIR}/product_performance_${DATE}.csv' (HEADER);
EOF

Data Quality Check

-- data_quality.sql
-- Perform data quality checks
 
-- Check for nulls (COUNT(col) skips NULLs; the columns here
-- are examples, adjust to the table being checked)
SELECT 
    COUNT(*) - COUNT(id)   as id_nulls,
    COUNT(*) - COUNT(name) as name_nulls,
    COUNT(*) - COUNT(age)  as age_nulls
FROM target_table;
 
-- Check for duplicates
SELECT 
    id,
    COUNT(*) as duplicate_count
FROM target_table
GROUP BY id
HAVING COUNT(*) > 1;
 
-- Value range checks
SELECT 
    MIN(value) as min_value,
    MAX(value) as max_value,
    AVG(value) as avg_value,
    STDDEV(value) as stddev_value
FROM numeric_table
WHERE value IS NOT NULL;
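
For a one-command profile, SUMMARIZE reports min, max, approximate distinct counts, and null percentage for every column:

SUMMARIZE target_table;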

Remember:

  • Use appropriate data types
  • Create indexes for large tables
  • Leverage parallel processing
  • Monitor memory usage
  • Take regular backups
  • Document queries and transformations

For detailed information, consult the DuckDB documentation (https://duckdb.org/docs/).