fileframe

package module
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 11, 2025 License: MIT Imports: 12 Imported by: 0

README

Go Reference Go Report Card MultiPlatformUnitTest Coverage

fileframe

logo

fileframe is a lightweight, immutable DataFrame library for Go. It provides intuitive data manipulation without the complexity of pandas-like APIs.

Features

  • Immutable operations - All methods return new DataFrames, never modify the original
  • Multiple file formats - CSV, TSV, LTSV, Parquet, XLSX
  • Compression support - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4

Installation

go get github.com/nao1215/fileframe

Quick Start

1. Load Data
// From file (auto-detects format and compression)
df, err := fileframe.NewDataFrameFromPath("data.csv")
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")  // compressed
df, err := fileframe.NewDataFrameFromPath("data.parquet") // Parquet

// From slice of maps
df := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Alice", "age": 30, "city": "Tokyo"},
    {"name": "Bob", "age": 25, "city": "Osaka"},
})

// From io.Reader
df, err := fileframe.NewDataFrame(reader, fileframe.CSV)
2. Transform Data
// All operations return NEW DataFrames (immutable)
result := df.
    Filter(func(row map[string]any) bool {
        age, _ := row["age"].(int64)
        return age >= 20
    }).
    Mutate("adult", func(row map[string]any) any {
        return true
    }).
    Select("name", "age", "adult")
3. Aggregate Data
grouped, err := df.GroupBy("city")
if err != nil {
    log.Fatal(err)
}

totals, err := grouped.Sum("sales")    // Returns (*DataFrame, error)
averages, err := grouped.Mean("price") // Returns (*DataFrame, error)
counts := grouped.Count()              // Returns *DataFrame
4. Export Data
err := df.ToCSV("output.csv")
err := df.ToTSV("output.tsv")
records := df.ToRecords() // []map[string]any

Core Concepts

Immutability

Every operation returns a new DataFrame. The original is never modified.

original := fileframe.NewDataFrameFromRecords(data)
filtered := original.Filter(fn)  // original is unchanged
mutated := filtered.Mutate(...)  // filtered is unchanged
Method Chaining

Chain multiple operations for clean, readable code:

result := df.
    FillNA(0).
    Filter(filterFn).
    Mutate("new_col", mutateFn).
    Select("col1", "col2", "new_col").
    Head(100)
Error Handling

Operations that can fail return (*DataFrame, error):

// These return errors
grouped, err := df.GroupBy("category")
sorted, err := df.Sort("price", fileframe.Descending)
joined, err := df.Join(other, opt)

// These never fail (return *DataFrame directly)
filtered := df.Filter(fn)
selected := df.Select("col1", "col2")
head := df.Head(10)

Common Operations

Filtering & Selection
// Filter rows
adults := df.Filter(func(row map[string]any) bool {
    age, ok := row["age"].(int64)
    return ok && age >= 18
})

// Select columns
subset := df.Select("name", "email", "phone")

// Drop columns
cleaned := df.Drop("internal_id", "debug_flag")
Adding & Modifying Columns
// Add new column
withTotal := df.Mutate("total", func(row map[string]any) any {
    qty, _ := row["quantity"].(int64)
    price, _ := row["price"].(float64)
    return float64(qty) * price
})

// Rename columns
renamed, err := df.Rename("old_name", "new_name")
renamed, err := df.RenameColumns(map[string]string{
    "col1": "column_one",
    "col2": "column_two",
})
Sorting & Deduplication
// Sort by single column
sorted, err := df.Sort("price", fileframe.Descending)

// Sort by multiple columns
sorted, err := df.SortBy(
    fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
    fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)

// Remove duplicates
unique := df.Distinct()
unique := df.DistinctBy("email") // by specific column
Row Selection
first10 := df.Head(10)
last5 := df.Tail(5)
limited := df.Limit(100) // alias for Head
Missing Values
// Remove rows with nil
cleaned := df.DropNA()
cleaned := df.DropNASubset("required_field")

// Fill nil values
filled := df.FillNA(0)
filled := df.FillNAByColumn(map[string]any{
    "name":   "Unknown",
    "age":    0,
    "active": false,
})
Joining DataFrames
users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
})

orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"user_id": 1, "product": "Laptop"},
    {"user_id": 1, "product": "Mouse"},
})

// Join types: InnerJoin, LeftJoin, RightJoin, OuterJoin
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"id", "user_id"}, // left column, right column
    How: fileframe.LeftJoin,
})
Concatenating DataFrames
// Same schema
combined, err := df1.Concat(df2, df3)

// Different schemas (union of columns, nil for missing)
combined, err := fileframe.ConcatAll(df1, df2, df3)
GroupBy & Aggregation
grouped, err := df.GroupBy("category")
if err != nil {
    log.Fatal(err)
}

// Built-in aggregations
counts := grouped.Count()              // *DataFrame
sums, err := grouped.Sum("amount")     // (*DataFrame, error)
means, err := grouped.Mean("price")    // (*DataFrame, error)
mins, err := grouped.Min("value")      // (*DataFrame, error)
maxs, err := grouped.Max("value")      // (*DataFrame, error)

// Custom aggregation
median, err := grouped.Agg("value", func(values []any) any {
    // Your aggregation logic here
    return computeMedian(values)
})

// Global aggregation (no grouping)
globalGrouped, _ := df.GroupBy()
totalSum, _ := globalGrouped.Sum("amount")

Complete Example

package main

import (
    "fmt"
    "log"

    "github.com/nao1215/fileframe"
)

func main() {
    // Load sales data
    df, err := fileframe.NewDataFrameFromPath("sales.csv")
    if err != nil {
        log.Fatal(err)
    }

    // Process data
    result := df.
        FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
        Filter(func(row map[string]any) bool {
            amount, _ := row["amount"].(int64)
            return amount > 0
        }).
        Mutate("revenue", func(row map[string]any) any {
            qty, _ := row["quantity"].(int64)
            price, _ := row["price"].(int64)
            return qty * price
        })

    // Aggregate by region
    grouped, err := result.GroupBy("region")
    if err != nil {
        log.Fatal(err)
    }

    byRegion, err := grouped.Sum("revenue")
    if err != nil {
        log.Fatal(err)
    }

    // Sort and get top 3
    sorted, err := byRegion.Sort("sum_revenue", fileframe.Descending)
    if err != nil {
        log.Fatal(err)
    }
    top3 := sorted.Head(3)

    // Output results
    for _, row := range top3.ToRecords() {
        fmt.Printf("%s: %v\n", row["region"], row["sum_revenue"])
    }

    // Export
    if err := top3.ToCSV("top_regions.csv"); err != nil {
        log.Fatal(err)
    }
}

Supported File Formats

Format Read Write Compression
CSV Yes Yes gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
TSV Yes Yes gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
LTSV Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
Parquet Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
XLSX Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
Supported Compression Formats
Format Extension Library Notes
gzip .gz compress/gzip Standard library
bzip2 .bz2 compress/bzip2 Standard library
xz .xz github.com/ulikunitz/xz Pure Go
zstd .zst github.com/klauspost/compress/zstd Pure Go, high performance
zlib .z compress/zlib Standard library
snappy .snappy github.com/klauspost/compress/snappy Pure Go, high performance
s2 .s2 github.com/klauspost/compress/s2 Snappy-compatible, faster
lz4 .lz4 github.com/pierrec/lz4/v4 Pure Go

Performance

Benchmarks on AMD RYZEN AI MAX+ 395:

Operation 100 rows 1,000 rows 10,000 rows
CSV Parse 140 µs 1.4 ms 5.1 ms
Filter 27 µs 304 µs 2.9 ms
Select 13 µs 110 µs 1.5 ms
Mutate 37 µs 332 µs 4.5 ms
GroupBy + Sum 7 µs 57 µs 635 µs

Memory usage (10,000 rows):

  • CSV Parse: 8.4 MB
  • Filter: 5.2 MB
  • GroupBy + Sum: 408 KB

When to Use fileframe

Use fileframe when:

  • Working with small to medium datasets (< 100,000 rows)
  • Need simple, readable data transformations
  • Want immutable operations for predictable code
  • Working with multiple file formats

Consider alternatives when:

  • Processing very large files (use filesql for streaming)
  • Need complex SQL-like queries (use filesql)
  • Require lazy evaluation

Contributing

Contributions are welcome! Please see the Contributing Guide for details.

Support

If you find this project useful, please consider giving it a star on GitHub.

License

MIT License - see LICENSE for details.

Documentation

Overview

Package fileframe provides a lightweight table utility that bridges fileprep and filesql.

fileframe is not a degraded copy of Pandas, but a practical tabular data manipulation tool that is idiomatic to Go. It follows the UNIX philosophy of doing one thing well.

Design Philosophy

  • Small: Do one thing well (UNIX philosophy)
  • Practical: Only features used in real data analysis
  • Simple and clear: API is self-explanatory
  • Intuitive: Natural Go-like coding style
  • Extensible: Complex features delegated to filesql

Basic Usage

// Create DataFrame from CSV
f, _ := os.Open("sales.csv")
defer f.Close()

df, err := fileframe.NewDataFrame(f, fileframe.CSV)
if err != nil {
    log.Fatal(err)
}

// Select columns and filter rows
result := df.
    Select("product", "amount", "category").
    Filter(func(row map[string]any) bool {
        amount, ok := row["amount"].(float64)
        return ok && amount > 1000
    })

// Group by and aggregate
grouped, err := result.GroupBy("category")
if err != nil {
    log.Fatal(err)
}
summary, err := grouped.Sum("amount")
if err != nil {
    log.Fatal(err)
}

// Output to CSV
if err := summary.ToCSV("summary.csv"); err != nil {
    log.Fatal(err)
}

Architecture

fileframe sits between fileprep (preprocessing) and filesql (persistence/SQL):

  • Receives output from fileprep (io.Reader -> DataFrame)
  • Performs basic transformations (Select, Filter, Mutate, GroupBy)
  • Outputs to CSV or passes to filesql

For complex operations like Window functions, subqueries, or large-scale data processing, use filesql directly.

Important Notes

  • All operations execute immediately (no lazy evaluation)
  • Target scale: Small to medium data (under 100,000 rows)
  • Row-oriented design with []map[string]any
  • All methods return new DataFrames (immutable operations)
Example

Example demonstrates basic DataFrame operations: reading CSV, filtering rows, grouping, and aggregating data.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sample sales data
	csvData := `product,amount,category
Apple,100,Fruit
Banana,150,Fruit
Carrot,80,Vegetable
Orange,120,Fruit
Broccoli,90,Vegetable`

	// Create DataFrame from CSV
	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Total rows: %d\n", df.Len())
	fmt.Printf("Columns: %v\n", df.Columns())

	// Filter: only items with amount > 100
	filtered := df.Filter(func(row map[string]any) bool {
		amount, ok := row["amount"].(int64)
		return ok && amount > 100
	})
	fmt.Printf("Rows with amount > 100: %d\n", filtered.Len())

	// GroupBy category and sum amounts
	groupedDf, err := df.GroupBy("category")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	grouped, err := groupedDf.Sum("amount")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Printf("Grouped columns: %v\n", grouped.Columns())

	// Show grouped results
	for _, row := range grouped.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["category"], row["sum_amount"])
	}

}
Output:
Total rows: 5
Columns: [product amount category]
Rows with amount > 100: 2
Grouped columns: [category sum_amount]
  Fruit: 370
  Vegetable: 170
Example (ComplexOperations)

Example_complexOperations demonstrates advanced DataFrame operations including multiple aggregations, data transformation with Mutate, and combining results from different DataFrames.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sales data
	salesCSV := `order_id,product_id,quantity,unit_price
1,P001,2,100
2,P002,1,200
3,P001,3,100
4,P003,5,50
5,P002,2,200`

	// Product master data
	productsCSV := `product_id,name,category
P001,Laptop Stand,Electronics
P002,Mechanical Keyboard,Electronics
P003,Notebook,Stationery`

	// Create DataFrames
	sales, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV)       //nolint:errcheck // example code
	products, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck // example code

	// Add calculated column: total_amount = quantity * unit_price
	salesWithTotal := sales.Mutate("total_amount", func(row map[string]any) any {
		qty, _ := row["quantity"].(int64)     //nolint:errcheck // example code
		price, _ := row["unit_price"].(int64) //nolint:errcheck // example code
		return qty * price
	})

	fmt.Println("=== Sales with Total Amount ===")
	for _, row := range salesWithTotal.ToRecords() {
		fmt.Printf("Order %v: %v x %v = %v\n",
			row["order_id"], row["quantity"], row["unit_price"], row["total_amount"])
	}

	// Aggregate sales by product_id
	salesByProductGrp, _ := salesWithTotal.GroupBy("product_id") //nolint:errcheck // example code
	salesByProduct, _ := salesByProductGrp.Sum("total_amount")   //nolint:errcheck // example code

	fmt.Println("\n=== Sales by Product ===")
	for _, row := range salesByProduct.ToRecords() {
		fmt.Printf("%s: %.0f\n", row["product_id"], row["sum_total_amount"])
	}

	// Create a lookup map from products DataFrame
	productLookup := make(map[string]map[string]any)
	for _, row := range products.ToRecords() {
		pid, _ := row["product_id"].(string) //nolint:errcheck // example code
		productLookup[pid] = row
	}

	// Combine sales summary with product info (manual join)
	combinedRecords := make([]map[string]any, 0)
	for _, salesRow := range salesByProduct.ToRecords() {
		pid, _ := salesRow["product_id"].(string) //nolint:errcheck // example code
		if productInfo, exists := productLookup[pid]; exists {
			combined := map[string]any{
				"product_id":  pid,
				"name":        productInfo["name"],
				"category":    productInfo["category"],
				"total_sales": salesRow["sum_total_amount"],
			}
			combinedRecords = append(combinedRecords, combined)
		}
	}
	combined := fileframe.NewDataFrameFromRecords(combinedRecords)

	fmt.Println("\n=== Combined Sales Report ===")
	fmt.Printf("Columns: %v\n", combined.Columns())
	for _, row := range combined.ToRecords() {
		fmt.Printf("%s (%s): %.0f\n",
			row["name"], row["category"], row["total_sales"])
	}

	// Group combined data by category
	byCategoryGrp, _ := combined.GroupBy("category")  //nolint:errcheck // example code
	byCategory, _ := byCategoryGrp.Sum("total_sales") //nolint:errcheck // example code

	fmt.Println("\n=== Total Sales by Category ===")
	for _, row := range byCategory.ToRecords() {
		fmt.Printf("%s: %.0f\n", row["category"], row["sum_total_sales"])
	}

	// Calculate statistics
	fmt.Println("\n=== Sales Statistics ===")
	statsGrp, _ := salesWithTotal.GroupBy()  //nolint:errcheck // example code
	stats, _ := statsGrp.Sum("total_amount") //nolint:errcheck // example code
	for _, row := range stats.ToRecords() {
		fmt.Printf("Total Revenue: %.0f\n", row["sum_total_amount"])
	}

	meanSalesGrp, _ := salesWithTotal.GroupBy()       //nolint:errcheck // example code
	meanSales, _ := meanSalesGrp.Mean("total_amount") //nolint:errcheck // example code
	for _, row := range meanSales.ToRecords() {
		fmt.Printf("Average Order Value: %.0f\n", row["mean_total_amount"])
	}

	minSalesGrp, _ := salesWithTotal.GroupBy()     //nolint:errcheck // example code
	minSales, _ := minSalesGrp.Min("total_amount") //nolint:errcheck // example code
	for _, row := range minSales.ToRecords() {
		fmt.Printf("Min Order: %.0f\n", row["min_total_amount"])
	}

	maxSalesGrp, _ := salesWithTotal.GroupBy()     //nolint:errcheck // example code
	maxSales, _ := maxSalesGrp.Max("total_amount") //nolint:errcheck // example code
	for _, row := range maxSales.ToRecords() {
		fmt.Printf("Max Order: %.0f\n", row["max_total_amount"])
	}

}
Output:
=== Sales with Total Amount ===
Order 1: 2 x 100 = 200
Order 2: 1 x 200 = 200
Order 3: 3 x 100 = 300
Order 4: 5 x 50 = 250
Order 5: 2 x 200 = 400

=== Sales by Product ===
P001: 500
P002: 600
P003: 250

=== Combined Sales Report ===
Columns: [category name product_id total_sales]
Laptop Stand (Electronics): 500
Mechanical Keyboard (Electronics): 600
Notebook (Stationery): 250

=== Total Sales by Category ===
Electronics: 1100
Stationery: 250

=== Sales Statistics ===
Total Revenue: 1350
Average Order Value: 270
Min Order: 200
Max Order: 400
Example (Concat)

Example_concat demonstrates vertical concatenation of DataFrames with the same schema. Use Concat when combining data from multiple sources with identical columns.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sales data from different regions (same schema)
	tokyoCSV := `region,product,sales
Tokyo,Laptop,100
Tokyo,Mouse,300`

	osakaCSV := `region,product,sales
Osaka,Laptop,80
Osaka,Mouse,250
Osaka,Keyboard,120`

	tokyo, _ := fileframe.NewDataFrame(strings.NewReader(tokyoCSV), fileframe.CSV) //nolint:errcheck
	osaka, _ := fileframe.NewDataFrame(strings.NewReader(osakaCSV), fileframe.CSV) //nolint:errcheck

	// Concat requires identical columns
	combined, err := tokyo.Concat(osaka)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Tokyo rows: %d\n", tokyo.Len())
	fmt.Printf("Osaka rows: %d\n", osaka.Len())
	fmt.Printf("Combined rows: %d\n", combined.Len())

	// Now we can analyze the combined data
	grouped, _ := combined.GroupBy("product") //nolint:errcheck
	totals, _ := grouped.Sum("sales")         //nolint:errcheck

	// Sort by product name for deterministic output
	sortedTotals, _ := totals.Sort("product", fileframe.Ascending) //nolint:errcheck

	fmt.Println("\nTotal sales by product:")
	for _, row := range sortedTotals.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["product"], row["sum_sales"])
	}

}
Output:
Tokyo rows: 2
Osaka rows: 3
Combined rows: 5

Total sales by product:
  Keyboard: 120
  Laptop: 180
  Mouse: 550
Example (ConcatAll)

Example_concatAll demonstrates flexible concatenation of DataFrames with different schemas. ConcatAll automatically handles different column sets by creating a union of all columns.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Data from 2023 - basic schema
	data2023CSV := `year,product,sales
2023,Laptop,1000
2023,Mouse,500`

	// Data from 2024 - added "region" column
	data2024CSV := `year,product,sales,region
2024,Laptop,1200,Tokyo
2024,Mouse,600,Osaka
2024,Keyboard,300,Tokyo`

	df2023, _ := fileframe.NewDataFrame(strings.NewReader(data2023CSV), fileframe.CSV) //nolint:errcheck
	df2024, _ := fileframe.NewDataFrame(strings.NewReader(data2024CSV), fileframe.CSV) //nolint:errcheck

	fmt.Printf("2023 columns: %v\n", df2023.Columns())
	fmt.Printf("2024 columns: %v\n", df2024.Columns())

	// ConcatAll handles different schemas
	combined, err := fileframe.ConcatAll(df2023, df2024)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Combined columns: %v\n", combined.Columns())
	fmt.Printf("Combined rows: %d\n", combined.Len())

	// 2023 data will have nil for "region"
	fmt.Println("\nCombined data:")
	for _, row := range combined.ToRecords() {
		region := row["region"]
		if region == nil {
			region = "(no region)"
		}
		// Handle int64 vs float64 for sales column
		var sales float64
		switch v := row["sales"].(type) {
		case int64:
			sales = float64(v)
		case float64:
			sales = v
		}
		fmt.Printf("  %v %s: %.0f - %v\n",
			row["year"], row["product"], sales, region)
	}

}
Output:
2023 columns: [year product sales]
2024 columns: [year product sales region]
Combined columns: [product region sales year]
Combined rows: 5

Combined data:
  2023 Laptop: 1000 - (no region)
  2023 Mouse: 500 - (no region)
  2024 Laptop: 1200 - Tokyo
  2024 Mouse: 600 - Osaka
  2024 Keyboard: 300 - Tokyo
Example (CustomAggregation)

Example_customAggregation demonstrates how to use the Agg function to implement custom aggregation logic such as median calculation.

package main

import (
	"fmt"
	"slices"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `category,value
A,10
A,20
A,30
A,40
A,50
B,5
B,15
B,25`

	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	grouped, err := df.GroupBy("category")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Custom aggregation: calculate median
	median := func(values []any) any {
		// Filter and convert to float64
		nums := make([]float64, 0, len(values))
		for _, v := range values {
			switch n := v.(type) {
			case int64:
				nums = append(nums, float64(n))
			case float64:
				nums = append(nums, n)
			}
		}
		if len(nums) == 0 {
			return nil
		}

		// Sort values
		slices.Sort(nums)

		// Calculate median
		mid := len(nums) / 2
		if len(nums)%2 == 0 {
			return (nums[mid-1] + nums[mid]) / 2
		}
		return nums[mid]
	}

	result, _ := grouped.Agg("value", median) //nolint:errcheck // example code

	fmt.Println("Median by category:")
	for _, row := range result.ToRecords() {
		fmt.Printf("  %s: %.1f\n", row["category"], row["agg_value"])
	}

	// Custom aggregation: calculate range (max - min)
	rangeFunc := func(values []any) any {
		var minVal, maxVal float64
		first := true
		for _, v := range values {
			var n float64
			switch val := v.(type) {
			case int64:
				n = float64(val)
			case float64:
				n = val
			default:
				continue
			}
			if first {
				minVal, maxVal = n, n
				first = false
			} else {
				if n < minVal {
					minVal = n
				}
				if n > maxVal {
					maxVal = n
				}
			}
		}
		if first {
			return nil
		}
		return maxVal - minVal
	}

	rangeResult, _ := grouped.Agg("value", rangeFunc) //nolint:errcheck // example code

	fmt.Println("Range by category:")
	for _, row := range rangeResult.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["category"], row["agg_value"])
	}

}
Output:
Median by category:
  A: 30.0
  B: 15.0
Range by category:
  A: 40
  B: 20
Example (DataframePipeline)

Example_dataframePipeline demonstrates chaining multiple DataFrame operations to build a complete data processing pipeline.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Raw sales data with some issues
	salesCSV := `date,region,product,quantity,price,salesperson
2024-01-15,Tokyo,Laptop,2,1000,Alice
2024-01-15,Tokyo,Mouse,10,25,Alice
2024-01-16,Osaka,Laptop,1,1000,Bob
2024-01-16,Osaka,Keyboard,5,75,
2024-01-17,Tokyo,Monitor,3,300,Charlie
2024-01-17,Nagoya,Mouse,8,25,Diana`

	df, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV) //nolint:errcheck

	// Pipeline: Clean -> Transform -> Aggregate -> Sort -> Limit
	result := df.
		// 1. Fill missing salesperson
		FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
		// 2. Add calculated column
		Mutate("revenue", func(row map[string]any) any {
			qty, _ := row["quantity"].(int64) //nolint:errcheck
			price, _ := row["price"].(int64)  //nolint:errcheck
			return float64(qty) * float64(price)
		}).
		// 3. Select relevant columns
		Select("region", "product", "revenue", "salesperson")

	// Group by region and sum revenue
	grouped, _ := result.GroupBy("region") //nolint:errcheck
	byRegion, _ := grouped.Sum("revenue")  //nolint:errcheck

	// Sort by revenue descending
	sorted, _ := byRegion.Sort("sum_revenue", fileframe.Descending) //nolint:errcheck

	fmt.Println("Revenue by Region (Top to Bottom):")
	for _, row := range sorted.ToRecords() {
		fmt.Printf("  %s: $%.0f\n", row["region"], row["sum_revenue"])
	}

	// Also get top 3 individual sales
	topSales, _ := result.Sort("revenue", fileframe.Descending) //nolint:errcheck
	fmt.Println("\nTop 3 Sales:")
	for _, row := range topSales.Head(3).ToRecords() {
		fmt.Printf("  %s in %s: $%.0f (by %s)\n",
			row["product"], row["region"], row["revenue"], row["salesperson"])
	}

}
Output:
Revenue by Region (Top to Bottom):
  Tokyo: $3150
  Osaka: $1375
  Nagoya: $200

Top 3 Sales:
  Laptop in Tokyo: $2000 (by Alice)
  Laptop in Osaka: $1000 (by Bob)
  Monitor in Tokyo: $900 (by Charlie)
Example (DropRename)

Example_dropRename demonstrates column manipulation operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `user_id,first_name,last_name,internal_code,email
1,Alice,Smith,X123,alice@example.com
2,Bob,Jones,X456,bob@example.com`

	df, _ := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Original columns: %v\n", df.Columns())

	// Drop internal column
	cleaned := df.Drop("internal_code")
	fmt.Printf("After Drop: %v\n", cleaned.Columns())

	// Rename columns for clarity
	renamed, _ := cleaned.RenameColumns(map[string]string{ //nolint:errcheck
		"first_name": "first",
		"last_name":  "last",
	})
	fmt.Printf("After Rename: %v\n", renamed.Columns())

	// Single column rename
	final, _ := renamed.Rename("user_id", "id") //nolint:errcheck
	fmt.Printf("Final columns: %v\n", final.Columns())

}
Output:
Original columns: [user_id first_name last_name internal_code email]
After Drop: [user_id first_name last_name email]
After Rename: [user_id first last email]
Final columns: [id first last email]
Example (FileFormats)

Example_fileFormats shows the various file formats supported by fileframe. NewDataFrameFromPath automatically detects file type and handles compression.

Supported formats:

  • CSV, TSV, LTSV, XLSX, Parquet
  • Compressed variants: .gz, .bz2, .xz, .zst, .z, .snappy, .s2, .lz4

Usage:

// Auto-detect CSV
df, err := fileframe.NewDataFrameFromPath("data.csv")

// Auto-detect compressed CSV (gzip)
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")

// Auto-detect TSV with zstd compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.zst")

// Auto-detect CSV with snappy compression
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")

// Auto-detect TSV with lz4 compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.lz4")

// Auto-detect Excel file
df, err := fileframe.NewDataFrameFromPath("spreadsheet.xlsx")

// Auto-detect Parquet file
df, err := fileframe.NewDataFrameFromPath("data.parquet")
package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// This example demonstrates the API for reading various file formats.
	// Since Example functions require deterministic output, we show
	// equivalent operations using NewDataFrame with explicit file types.

	// TSV (Tab-Separated Values)
	tsvData := "name\tage\tcity\nAlice\t30\tTokyo\nBob\t25\tOsaka"
	dfTSV, _ := fileframe.NewDataFrame(strings.NewReader(tsvData), fileframe.TSV) //nolint:errcheck
	fmt.Printf("TSV columns: %v, rows: %d\n", dfTSV.Columns(), dfTSV.Len())

	// LTSV (Labeled Tab-Separated Values)
	ltsvData := "name:Alice\tage:30\tcity:Tokyo\nname:Bob\tage:25\tcity:Osaka"
	dfLTSV, _ := fileframe.NewDataFrame(strings.NewReader(ltsvData), fileframe.LTSV) //nolint:errcheck
	fmt.Printf("LTSV columns: %v, rows: %d\n", dfLTSV.Columns(), dfLTSV.Len())

	// For file-based operations with compression, use NewDataFrameFromPath:
	//
	//   df, err := fileframe.NewDataFrameFromPath("logs.csv.gz")      // gzip
	//   df, err := fileframe.NewDataFrameFromPath("data.tsv.bz2")     // bzip2
	//   df, err := fileframe.NewDataFrameFromPath("export.csv.xz")    // xz
	//   df, err := fileframe.NewDataFrameFromPath("archive.csv.zst")  // zstd
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.z")       // zlib
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")  // snappy
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.s2")      // s2
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.lz4")     // lz4

}
Output:
TSV columns: [name age city], rows: 2
LTSV columns: [name age city], rows: 2
Example (GlobalAggregation)

Example_globalAggregation demonstrates how to calculate global statistics without grouping by any column. Call GroupBy() with no arguments to aggregate the entire DataFrame into a single result.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `product,price,quantity
Laptop,1000,5
Mouse,25,50
Keyboard,75,30
Monitor,300,10`

	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// GroupBy() with no arguments = global aggregation (entire DataFrame as one group)
	grouped, err := df.GroupBy()
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Calculate various statistics for the entire dataset
	count := grouped.Count()
	fmt.Printf("Total products: %d\n", count.ToRecords()[0]["count"])

	sumResult, _ := grouped.Sum("price") //nolint:errcheck // example code
	fmt.Printf("Sum of prices: %.0f\n", sumResult.ToRecords()[0]["sum_price"])

	meanResult, _ := grouped.Mean("quantity") //nolint:errcheck // example code
	fmt.Printf("Average quantity: %.2f\n", meanResult.ToRecords()[0]["mean_quantity"])

	minResult, _ := grouped.Min("price") //nolint:errcheck // example code
	fmt.Printf("Min price: %.0f\n", minResult.ToRecords()[0]["min_price"])

	maxResult, _ := grouped.Max("price") //nolint:errcheck // example code
	fmt.Printf("Max price: %.0f\n", maxResult.ToRecords()[0]["max_price"])

}
Output:
Total products: 4
Sum of prices: 1400
Average quantity: 23.75
Min price: 25
Max price: 1000
Example (HandleMissingValues)

Example_handleMissingValues demonstrates DropNA and FillNA operations.

package main

import (
	"fmt"

	"github.com/nao1215/fileframe"
)

func main() {
	// Create DataFrame with nil values
	records := []map[string]any{
		{"name": "Alice", "age": int64(30), "city": "Tokyo"},
		{"name": "Bob", "age": nil, "city": "Osaka"},
		{"name": nil, "age": int64(25), "city": nil},
		{"name": "Diana", "age": int64(35), "city": "Kyoto"},
	}
	df := fileframe.NewDataFrameFromRecords(records)

	fmt.Printf("Original rows: %d\n", df.Len())

	// DropNA: Remove rows with any nil values
	cleaned := df.DropNA()
	fmt.Printf("After DropNA: %d rows\n", cleaned.Len())

	// DropNASubset: Remove rows with nil only in specific columns
	partialClean := df.DropNASubset("name")
	fmt.Printf("After DropNASubset(name): %d rows\n", partialClean.Len())

	// FillNA: Replace all nil values with a default
	filled := df.FillNA("Unknown")
	fmt.Println("\nAfter FillNA('Unknown'):")
	for _, row := range filled.ToRecords() {
		fmt.Printf("  %v, %v, %v\n", row["name"], row["age"], row["city"])
	}

	// FillNAByColumn: Different defaults per column
	smartFilled := df.FillNAByColumn(map[string]any{
		"name": "Anonymous",
		"age":  int64(0),
		"city": "Unknown",
	})
	fmt.Println("\nAfter FillNAByColumn:")
	for _, row := range smartFilled.ToRecords() {
		fmt.Printf("  %v, %v, %v\n", row["name"], row["age"], row["city"])
	}

}
Output:
Original rows: 4
After DropNA: 2 rows
After DropNASubset(name): 3 rows

After FillNA('Unknown'):
  Alice, 30, Tokyo
  Bob, Unknown, Osaka
  Unknown, 25, Unknown
  Diana, 35, Kyoto

After FillNAByColumn:
  Alice, 30, Tokyo
  Bob, 0, Osaka
  Anonymous, 25, Unknown
  Diana, 35, Kyoto
Example (HeadTailLimit)

Example_headTailLimit demonstrates row selection operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	data := `id,value
1,100
2,200
3,300
4,400
5,500
6,600
7,700`

	frame, _ := fileframe.NewDataFrame(strings.NewReader(data), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Total rows: %d\n", frame.Len())

	// First three rows.
	fmt.Printf("\nHead(3) - first 3 rows:\n")
	for _, r := range frame.Head(3).ToRecords() {
		fmt.Printf("  id=%v, value=%v\n", r["id"], r["value"])
	}

	// Last two rows.
	fmt.Printf("\nTail(2) - last 2 rows:\n")
	for _, r := range frame.Tail(2).ToRecords() {
		fmt.Printf("  id=%v, value=%v\n", r["id"], r["value"])
	}

	// Limit behaves exactly like Head and reads like SQL.
	fmt.Printf("\nLimit(2) rows: %d\n", frame.Limit(2).Len())

}
Output:
Total rows: 7

Head(3) - first 3 rows:
  id=1, value=100
  id=2, value=200
  id=3, value=300

Tail(2) - last 2 rows:
  id=6, value=600
  id=7, value=700

Limit(2) rows: 2
Example (Join)

Example_join demonstrates how to combine two DataFrames using Join. This is similar to SQL JOIN operations and supports inner, left, right, and outer joins.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Left side: user records keyed by id.
	userData := `id,name,department
1,Alice,Engineering
2,Bob,Marketing
3,Charlie,Engineering
4,Diana,Sales`

	// Right side: orders whose user_id references users.id.
	orderData := `order_id,user_id,product,amount
101,1,Laptop,1200
102,1,Mouse,50
103,2,Monitor,400
104,5,Keyboard,100`

	userDF, _ := fileframe.NewDataFrame(strings.NewReader(userData), fileframe.CSV)   //nolint:errcheck
	orderDF, _ := fileframe.NewDataFrame(strings.NewReader(orderData), fileframe.CSV) //nolint:errcheck

	// Inner join keeps only users that placed at least one order.
	joined, _ := userDF.Join(orderDF, fileframe.JoinOption{ //nolint:errcheck
		On:  []string{"id", "user_id"}, // left column, then right column
		How: fileframe.InnerJoin,
	})
	fmt.Println("=== Inner Join (users with orders) ===")
	fmt.Printf("Rows: %d\n", joined.Len())
	for _, r := range joined.ToRecords() {
		fmt.Printf("  %s ordered %s ($%v)\n", r["name"], r["product"], r["amount"])
	}

	// Left join keeps every user; unmatched order columns come back nil.
	allUsers, _ := userDF.Join(orderDF, fileframe.JoinOption{ //nolint:errcheck
		On:  []string{"id", "user_id"},
		How: fileframe.LeftJoin,
	})
	fmt.Println("\n=== Left Join (all users) ===")
	fmt.Printf("Rows: %d\n", allUsers.Len())

	// Tally the users whose order_id is nil, i.e. users with no orders.
	missing := 0
	for _, r := range allUsers.ToRecords() {
		if r["order_id"] == nil {
			missing++
		}
	}
	fmt.Printf("Users without orders: %d\n", missing)

}
Output:
=== Inner Join (users with orders) ===
Rows: 3
  Alice ordered Laptop ($1200)
  Alice ordered Mouse ($50)
  Bob ordered Monitor ($400)

=== Left Join (all users) ===
Rows: 5
Users without orders: 2
Example (JoinTypes)

Example_joinTypes demonstrates all four join types: InnerJoin, LeftJoin, RightJoin, and OuterJoin.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Left DataFrame: the product catalog.
	productsCSV := `product_id,name
P1,Laptop
P2,Mouse
P3,Keyboard`

	// Right DataFrame: stock levels; P4 has no matching product.
	inventoryCSV := `item_id,quantity,warehouse
P1,50,Tokyo
P2,200,Osaka
P4,30,Tokyo`

	prodDF, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck
	invDF, _ := fileframe.NewDataFrame(strings.NewReader(inventoryCSV), fileframe.CSV) //nolint:errcheck

	// joinWith runs one join of the requested type on the same key pair.
	joinWith := func(how fileframe.JoinType) *fileframe.DataFrame {
		res, _ := prodDF.Join(invDF, fileframe.JoinOption{ //nolint:errcheck
			On:  []string{"product_id", "item_id"},
			How: how,
		})
		return res
	}

	// Inner: intersection; Left: all products; Right: all inventory;
	// Outer: union of both sides.
	fmt.Printf("Inner Join: %d rows (products in inventory)\n", joinWith(fileframe.InnerJoin).Len())
	fmt.Printf("Left Join: %d rows (all products)\n", joinWith(fileframe.LeftJoin).Len())
	fmt.Printf("Right Join: %d rows (all inventory)\n", joinWith(fileframe.RightJoin).Len())
	fmt.Printf("Outer Join: %d rows (all products + all inventory)\n", joinWith(fileframe.OuterJoin).Len())

}
Output:
Inner Join: 2 rows (products in inventory)
Left Join: 3 rows (all products)
Right Join: 3 rows (all inventory)
Outer Join: 4 rows (all products + all inventory)
Example (SortAndDistinct)

Example_sortAndDistinct demonstrates sorting and deduplication operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	data := `name,category,score
Alice,A,85
Bob,B,90
Charlie,A,85
Alice,A,85
Diana,B,75
Eve,A,95`

	frame, _ := fileframe.NewDataFrame(strings.NewReader(data), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Original rows: %d\n", frame.Len())

	// Drop exact-duplicate rows (the second "Alice,A,85").
	deduped := frame.Distinct()
	fmt.Printf("After Distinct: %d rows\n", deduped.Len())

	// Single-column sort: highest score first.
	byScore, _ := deduped.Sort("score", fileframe.Descending) //nolint:errcheck
	fmt.Println("\nTop scores:")
	for _, r := range byScore.Head(3).ToRecords() {
		fmt.Printf("  %s: %v\n", r["name"], r["score"])
	}

	// Multi-column sort: category ascending, ties broken by score descending.
	byCategory, _ := deduped.SortBy( //nolint:errcheck
		fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
		fileframe.SortOption{Column: "score", Order: fileframe.Descending},
	)
	fmt.Println("\nBy category, then score:")
	for _, r := range byCategory.ToRecords() {
		fmt.Printf("  [%s] %s: %v\n", r["category"], r["name"], r["score"])
	}

}
Output:
Original rows: 6
After Distinct: 5 rows

Top scores:
  Eve: 95
  Bob: 90
  Alice: 85

By category, then score:
  [A] Eve: 95
  [A] Alice: 85
  [A] Charlie: 85
  [B] Bob: 90
  [B] Diana: 75

Index

Examples

Constants

View Source
const (
	// CSV represents CSV file type
	CSV = fileparser.CSV
	// TSV represents TSV file type
	TSV = fileparser.TSV
	// LTSV represents LTSV file type
	LTSV = fileparser.LTSV
	// Parquet represents Parquet file type
	Parquet = fileparser.Parquet
	// XLSX represents Excel XLSX file type
	XLSX = fileparser.XLSX

	// Compressed CSV variants (gzip, bzip2, xz, zstd).
	CSVGZ   = fileparser.CSVGZ
	CSVBZ2  = fileparser.CSVBZ2
	CSVXZ   = fileparser.CSVXZ
	CSVZSTD = fileparser.CSVZSTD

	// Compressed TSV variants (gzip, bzip2, xz, zstd).
	TSVGZ   = fileparser.TSVGZ
	TSVBZ2  = fileparser.TSVBZ2
	TSVXZ   = fileparser.TSVXZ
	TSVZSTD = fileparser.TSVZSTD

	// Compressed LTSV variants (gzip, bzip2, xz, zstd).
	LTSVGZ   = fileparser.LTSVGZ
	LTSVBZ2  = fileparser.LTSVBZ2
	LTSVXZ   = fileparser.LTSVXZ
	LTSVZSTD = fileparser.LTSVZSTD

	// Compressed Parquet variants (gzip, bzip2, xz, zstd).
	ParquetGZ   = fileparser.ParquetGZ
	ParquetBZ2  = fileparser.ParquetBZ2
	ParquetXZ   = fileparser.ParquetXZ
	ParquetZSTD = fileparser.ParquetZSTD

	// Compressed XLSX variants (gzip, bzip2, xz, zstd).
	XLSXGZ   = fileparser.XLSXGZ
	XLSXBZ2  = fileparser.XLSXBZ2
	XLSXXZ   = fileparser.XLSXXZ
	XLSXZSTD = fileparser.XLSXZSTD

	// zlib-compressed variants, one per base format.
	CSVZLIB     = fileparser.CSVZLIB
	TSVZLIB     = fileparser.TSVZLIB
	LTSVZLIB    = fileparser.LTSVZLIB
	ParquetZLIB = fileparser.ParquetZLIB
	XLSXZLIB    = fileparser.XLSXZLIB

	// Snappy-compressed variants, one per base format.
	CSVSNAPPY     = fileparser.CSVSNAPPY
	TSVSNAPPY     = fileparser.TSVSNAPPY
	LTSVSNAPPY    = fileparser.LTSVSNAPPY
	ParquetSNAPPY = fileparser.ParquetSNAPPY
	XLSXSNAPPY    = fileparser.XLSXSNAPPY

	// S2-compressed variants, one per base format.
	CSVS2     = fileparser.CSVS2
	TSVS2     = fileparser.TSVS2
	LTSVS2    = fileparser.LTSVS2
	ParquetS2 = fileparser.ParquetS2
	XLSXS2    = fileparser.XLSXS2

	// LZ4-compressed variants, one per base format.
	CSVLZ4     = fileparser.CSVLZ4
	TSVLZ4     = fileparser.TSVLZ4
	LTSVLZ4    = fileparser.LTSVLZ4
	ParquetLZ4 = fileparser.ParquetLZ4
	XLSXLZ4    = fileparser.XLSXLZ4
)

Supported file types (re-exported from fileparser)

Variables

View Source
// ErrColumnNotFound is returned when a specified column does not exist
// in the DataFrame. Callers should compare with errors.Is.
var ErrColumnNotFound = errors.New("column not found")

ErrColumnNotFound is returned when a specified column does not exist in the DataFrame.

Functions

This section is empty.

Types

type AggFunc

type AggFunc func(values []any) any

AggFunc is a function type for custom aggregation. It receives a slice of values from the same group and returns the aggregated result.

// AggCount counts the non-nil values in a group. All non-nil values are
// counted, numeric or not. The result is an int64.
var AggCount AggFunc = func(values []any) any {
	var n int64
	for _, v := range values {
		if v == nil {
			continue
		}
		n++
	}
	return n
}

AggCount counts the number of non-nil values. Returns int64 count. Note: this counts all non-nil values, not just numeric ones.

// AggMax finds the maximum numeric value in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored. Returns nil if no
// numeric values exist; otherwise returns float64.
var AggMax AggFunc = func(values []any) any {
	// Seed from the first numeric value instead of a -math.MaxFloat64
	// sentinel, so inputs such as math.Inf(-1) are reported correctly
	// (the sentinel compares greater than -Inf and would mask it).
	var maxVal float64
	found := false
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		if !found || f > maxVal {
			maxVal = f
			found = true
		}
	}
	if !found {
		return nil
	}
	return maxVal
}

AggMax finds the maximum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggMean calculates the arithmetic mean of the numeric values in a group.
// Non-numeric values (including nil, strings, bools) are silently ignored.
// Returns nil if no numeric values exist; otherwise returns float64.
var AggMean AggFunc = func(values []any) any {
	var (
		total float64
		n     int
	)
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		total += f
		n++
	}
	if n == 0 {
		return nil
	}
	return total / float64(n)
}

AggMean calculates the arithmetic mean of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggMin finds the minimum numeric value in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored. Returns nil if no
// numeric values exist; otherwise returns float64.
var AggMin AggFunc = func(values []any) any {
	// Seed from the first numeric value instead of a math.MaxFloat64
	// sentinel, so inputs such as math.Inf(+1) are reported correctly
	// (the sentinel compares less than +Inf and would mask it).
	var minVal float64
	found := false
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		if !found || f < minVal {
			minVal = f
			found = true
		}
	}
	if !found {
		return nil
	}
	return minVal
}

AggMin finds the minimum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggSum totals the numeric values in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored, so the result is
// 0.0 when no numeric values exist. Always returns float64.
var AggSum AggFunc = func(values []any) any {
	var total float64
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		total += f
	}
	return total
}

AggSum calculates the sum of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns 0.0 if all values are non-numeric. Always returns float64.

type DataFrame

type DataFrame struct {
	// contains filtered or unexported fields
}

DataFrame is a simple representation of tabular data. It stores data in row-oriented format with immediate execution (no lazy evaluation).

func ConcatAll

func ConcatAll(frames ...*DataFrame) (*DataFrame, error)

ConcatAll concatenates multiple DataFrames vertically, automatically handling different column sets by taking the union of all columns. This is a standalone function (not a method) that accepts any number of DataFrames.

Column Handling:

  • Columns from all DataFrames are collected into a union set
  • Columns are sorted alphabetically for deterministic output
  • Missing values in rows are set to nil

Nil DataFrames are silently skipped, making this safe for optional data.

Use Cases:

  • Combining data from different sources with overlapping schemas
  • Merging datasets that evolved over time with different columns
  • Appending new data with additional fields to existing data

Example - Combining data with different schemas:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Alice", "age": 30},
})
contacts := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Bob", "email": "bob@example.com"},
})
result, err := fileframe.ConcatAll(users, contacts)
// Result columns: ["age", "email", "name"] (sorted alphabetically)
// Alice has nil for email, Bob has nil for age

Example - Combining CSV and TSV data:

csv, _ := fileframe.NewDataFrameFromPath("users.csv")
tsv, _ := fileframe.NewDataFrameFromPath("extra_info.tsv")
combined, err := fileframe.ConcatAll(csv, tsv)

func NewDataFrame

func NewDataFrame(reader io.Reader, fileType FileType) (*DataFrame, error)

NewDataFrame creates a DataFrame from an io.Reader. It supports CSV, TSV, LTSV, XLSX, and Parquet formats.

Example:

f, _ := os.Open("data.csv")
defer f.Close()
df, err := fileframe.NewDataFrame(f, fileframe.CSV)

func NewDataFrameFromPath

func NewDataFrameFromPath(path string) (*DataFrame, error)

NewDataFrameFromPath creates a DataFrame from a file path. It automatically detects the file type and handles compressed files (gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4).

Supported formats: CSV, TSV, LTSV, XLSX, Parquet, and their compressed variants. For XLSX files with multiple sheets, the first sheet is used.

Example:

df, err := fileframe.NewDataFrameFromPath("data.csv.gz")
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")

func NewDataFrameFromRecords

func NewDataFrameFromRecords(records []map[string]any) *DataFrame

NewDataFrameFromRecords creates a DataFrame from a slice of maps. Each map represents a row with column names as keys. Column order is determined by processing records in order, and within each record, keys are sorted alphabetically. New columns are appended as they are encountered.

Example:

records := []map[string]any{
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25},
}
df := fileframe.NewDataFrameFromRecords(records)

func (*DataFrame) Columns

func (df *DataFrame) Columns() []string

Columns returns a copy of the column names.

func (*DataFrame) Concat

func (df *DataFrame) Concat(others ...*DataFrame) (*DataFrame, error)

Concat concatenates multiple DataFrames vertically (row-wise). This is useful for combining data from multiple sources with the same schema.

Requirements:

  • All DataFrames must have exactly the same columns in the same order
  • If columns differ, use ConcatAll instead for flexible concatenation

Returns an error if:

  • Any DataFrame is nil
  • Columns don't match (different names or different order)

Example - Combining monthly data:

jan := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Jan", "sales": 100},
})
feb := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Feb", "sales": 150},
})
mar := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Mar", "sales": 200},
})
quarterly, err := jan.Concat(feb, mar)
// Result: 3 rows with all monthly data

Example - Combining data from multiple CSV files:

df1, _ := fileframe.NewDataFrameFromPath("data_2024_01.csv")
df2, _ := fileframe.NewDataFrameFromPath("data_2024_02.csv")
combined, err := df1.Concat(df2)

func (*DataFrame) Distinct

func (df *DataFrame) Distinct() *DataFrame

Distinct returns a new DataFrame with duplicate rows removed. Two rows are considered duplicates if all their column values are equal.

Example:

unique := df.Distinct()

func (*DataFrame) DistinctBy

func (df *DataFrame) DistinctBy(columns ...string) *DataFrame

DistinctBy returns a new DataFrame with duplicate rows removed based on the specified columns only.

Example:

unique := df.DistinctBy("name", "email")

func (*DataFrame) Drop

func (df *DataFrame) Drop(columns ...string) *DataFrame

Drop returns a new DataFrame with the specified columns removed. Columns that do not exist are silently ignored.

Example:

dropped := df.Drop("temp_col", "debug_col")

func (*DataFrame) DropNA

func (df *DataFrame) DropNA() *DataFrame

DropNA returns a new DataFrame with rows containing nil values removed. By default, removes rows where any column has a nil value.

Example:

cleaned := df.DropNA()

func (*DataFrame) DropNASubset

func (df *DataFrame) DropNASubset(columns ...string) *DataFrame

DropNASubset returns a new DataFrame with rows removed where any of the specified columns have nil values.

Example:

cleaned := df.DropNASubset("required_field1", "required_field2")

func (*DataFrame) FillNA

func (df *DataFrame) FillNA(value any) *DataFrame

FillNA returns a new DataFrame with nil values replaced by the specified value.

Example:

filled := df.FillNA(0)  // Replace all nil with 0

func (*DataFrame) FillNAByColumn

func (df *DataFrame) FillNAByColumn(values map[string]any) *DataFrame

FillNAByColumn returns a new DataFrame with nil values replaced by column-specific values. Columns not in the map retain their nil values.

Example:

filled := df.FillNAByColumn(map[string]any{
    "age":    0,
    "name":   "Unknown",
    "active": false,
})

func (*DataFrame) Filter

func (df *DataFrame) Filter(fn func(row map[string]any) bool) *DataFrame

Filter returns a new DataFrame containing only rows that satisfy the predicate. The predicate function receives a copy of each row to prevent accidental mutation of the original DataFrame.

Example:

filtered := df.Filter(func(row map[string]any) bool {
    age, ok := row["age"].(int64)
    return ok && age >= 18
})

func (*DataFrame) GroupBy

func (df *DataFrame) GroupBy(columns ...string) (*GroupedDataFrame, error)

GroupBy groups the DataFrame by the specified columns. Returns a GroupedDataFrame that can be used with aggregation functions. Returns an error if any of the specified columns do not exist in the DataFrame.

Example:

grouped, err := df.GroupBy("category")
if err != nil {
    log.Fatal(err)
}
result, err := grouped.Sum("amount")

func (*DataFrame) Head

func (df *DataFrame) Head(n int) *DataFrame

Head returns a new DataFrame with the first n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.

Example:

first10 := df.Head(10)

func (*DataFrame) Join

func (df *DataFrame) Join(other *DataFrame, opt JoinOption) (*DataFrame, error)

Join combines two DataFrames based on a common column or column pair. This method enables SQL-like join operations between DataFrames.

Join Types:

  • InnerJoin: Returns only matching rows from both DataFrames
  • LeftJoin: Returns all left rows, with nil for unmatched right columns
  • RightJoin: Returns all right rows, with nil for unmatched left columns
  • OuterJoin: Returns all rows from both, with nil for unmatched columns

Column Handling:

  • The join column from the right DataFrame is excluded from the result
  • Conflicting column names are prefixed with "right_"
  • Result column order: left columns first, then right columns

Limitations:

  • Currently supports joining on a single column pair (1 or 2 columns in On)
  • For complex joins with multiple keys, consider using filesql

Example - Inner Join with same column name:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "product": "Laptop"},
    {"id": 1, "product": "Mouse"},
})
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"id"},
    How: fileframe.InnerJoin,
})
// Result: [{id:1, name:Alice, product:Laptop}, {id:1, name:Alice, product:Mouse}]

Example - Left Join with different column names:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"user_id": 1, "name": "Alice"},
    {"user_id": 2, "name": "Bob"},
    {"user_id": 3, "name": "Charlie"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"customer_id": 1, "product": "Laptop"},
})
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"user_id", "customer_id"},
    How: fileframe.LeftJoin,
})
// Result includes all 3 users; Bob and Charlie have nil for product

func (*DataFrame) Len

func (df *DataFrame) Len() int

Len returns the number of rows in the DataFrame.

func (*DataFrame) Limit

func (df *DataFrame) Limit(n int) *DataFrame

Limit is an alias for Head. Returns a new DataFrame with the first n rows.

Example:

limited := df.Limit(100)

func (*DataFrame) Mutate

func (df *DataFrame) Mutate(column string, fn func(row map[string]any) any) *DataFrame

Mutate returns a new DataFrame with a new or modified column. The function receives a copy of each row and returns the value for the new column. The original DataFrame is not modified.

If the column name is empty or the function is nil, Mutate returns a clone of the original DataFrame without any modifications.

Example:

mutated := df.Mutate("full_name", func(row map[string]any) any {
    first := row["first_name"].(string)
    last := row["last_name"].(string)
    return first + " " + last
})

func (*DataFrame) Rename

func (df *DataFrame) Rename(oldName, newName string) (*DataFrame, error)

Rename returns a new DataFrame with the specified column renamed. Returns an error if the old column does not exist or if the new column name already exists.

Example:

renamed, err := df.Rename("old_name", "new_name")

func (*DataFrame) RenameColumns

func (df *DataFrame) RenameColumns(renames map[string]string) (*DataFrame, error)

RenameColumns returns a new DataFrame with multiple columns renamed. The renames map specifies old name -> new name mappings. Returns an error if any old column does not exist or if any new name conflicts.

Example:

renamed, err := df.RenameColumns(map[string]string{
    "col1": "column_one",
    "col2": "column_two",
})

func (*DataFrame) Select

func (df *DataFrame) Select(columns ...string) *DataFrame

Select returns a new DataFrame with only the specified columns. Columns that do not exist are silently ignored.

Example:

selected := df.Select("name", "age")

func (*DataFrame) Sort

func (df *DataFrame) Sort(column string, order SortOrder) (*DataFrame, error)

Sort returns a new DataFrame sorted by the specified column. Supports sorting by string, int64, and float64 values. Nil values are placed at the end regardless of sort order.

Example:

sorted, err := df.Sort("age", fileframe.Ascending)

func (*DataFrame) SortBy

func (df *DataFrame) SortBy(options ...SortOption) (*DataFrame, error)

SortBy returns a new DataFrame sorted by multiple columns. Columns are sorted in the order specified (first column has highest priority).

Example:

sorted, err := df.SortBy(
    fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
    fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)

func (*DataFrame) Tail

func (df *DataFrame) Tail(n int) *DataFrame

Tail returns a new DataFrame with the last n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.

Example:

last10 := df.Tail(10)

func (*DataFrame) ToCSV

func (df *DataFrame) ToCSV(path string) error

ToCSV writes the DataFrame to a CSV file.

Example:

err := df.ToCSV("output.csv")

func (*DataFrame) ToRecords

func (df *DataFrame) ToRecords() []map[string]any

ToRecords returns the data as a slice of maps. Each map is a copy to ensure immutability.

func (*DataFrame) ToTSV

func (df *DataFrame) ToTSV(path string) error

ToTSV writes the DataFrame to a TSV file.

Example:

err := df.ToTSV("output.tsv")

type FileType

type FileType = fileparser.FileType

FileType represents supported file types including compression variants. This is an alias for fileparser.FileType.

type GroupedDataFrame

type GroupedDataFrame struct {
	// contains filtered or unexported fields
}

GroupedDataFrame represents a DataFrame grouped by one or more columns.

func (*GroupedDataFrame) Agg

func (gdf *GroupedDataFrame) Agg(column string, fn AggFunc) (*DataFrame, error)

Agg performs a custom aggregation on the specified column. The result column is named "agg_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

median, err := grouped.Agg("amount", func(values []any) any {
    sorted := sortValues(values)
    return sorted[len(sorted)/2]
})

func (*GroupedDataFrame) Count

func (gdf *GroupedDataFrame) Count() *DataFrame

Count returns a DataFrame with the count of rows in each group. The result column is named "count".

Example:

grouped, _ := df.GroupBy("category")
counts := grouped.Count()

func (*GroupedDataFrame) Max

func (gdf *GroupedDataFrame) Max(column string) (*DataFrame, error)

Max returns a DataFrame with the maximum value in the specified column for each group. The result column is named "max_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
maximums, err := grouped.Max("amount")

func (*GroupedDataFrame) Mean

func (gdf *GroupedDataFrame) Mean(column string) (*DataFrame, error)

Mean returns a DataFrame with the mean of values in the specified column for each group. The result column is named "mean_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
averages, err := grouped.Mean("amount")

func (*GroupedDataFrame) Min

func (gdf *GroupedDataFrame) Min(column string) (*DataFrame, error)

Min returns a DataFrame with the minimum value in the specified column for each group. The result column is named "min_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
minimums, err := grouped.Min("amount")

func (*GroupedDataFrame) Sum

func (gdf *GroupedDataFrame) Sum(column string) (*DataFrame, error)

Sum returns a DataFrame with the sum of values in the specified column for each group. The result column is named "sum_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
totals, err := grouped.Sum("amount")

type JoinOption

type JoinOption struct {
	// On specifies the column(s) to join on.
	// If one column is specified, it is used for both DataFrames.
	// If two columns are specified, the first is for the left DataFrame and the second for the right.
	On []string
	// How specifies the type of join (InnerJoin, LeftJoin, RightJoin, OuterJoin).
	How JoinType
}

JoinOption specifies options for the Join operation.

On field specifies the join column(s):

  • One column: Used for both DataFrames (e.g., On: []string{"id"})
  • Two columns: First for left DataFrame, second for right (e.g., On: []string{"id", "user_id"})

How field specifies the join type (InnerJoin, LeftJoin, RightJoin, OuterJoin).

Example:

// Same column name in both DataFrames
opt := fileframe.JoinOption{On: []string{"id"}, How: fileframe.InnerJoin}

// Different column names
opt := fileframe.JoinOption{On: []string{"id", "user_id"}, How: fileframe.LeftJoin}

type JoinType

type JoinType int

JoinType represents the type of join operation. Four join types are supported: InnerJoin, LeftJoin, RightJoin, and OuterJoin.

const (
	// InnerJoin returns only rows that have matching values in both DataFrames.
	// This is the most restrictive join type - rows without matches are excluded.
	//
	// Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2, 4],
	// an inner join returns only rows for users 1 and 2.
	InnerJoin JoinType = iota

	// LeftJoin returns all rows from the left DataFrame and matched rows from the right DataFrame.
	// For left rows without matches, the right columns will have nil values.
	//
	// Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2],
	// a left join returns all 3 users, with user 3 having nil for order columns.
	LeftJoin

	// RightJoin returns all rows from the right DataFrame and matched rows from the left DataFrame.
	// For right rows without matches, the left columns will have nil values.
	//
	// Example: If users has ids [1, 2] and orders has user_ids [1, 2, 3],
	// a right join returns all 3 orders, with order 3 having nil for user columns.
	RightJoin

	// OuterJoin returns all rows from both DataFrames.
	// Unmatched rows will have nil values for columns from the other DataFrame.
	// This is the most inclusive join type - no rows are excluded.
	//
	// Example: If users has ids [1, 2] and orders has user_ids [2, 3],
	// an outer join returns users 1, 2 and orders 2, 3 (4 rows total).
	OuterJoin
)

type SortOption

type SortOption struct {
	// Column is the column name to sort by.
	Column string
	// Order specifies ascending or descending sort order.
	Order SortOrder
}

SortOption specifies options for the Sort operation.

type SortOrder

type SortOrder int

SortOrder specifies the order for sorting.

const (
	// Ascending sorts values from smallest to largest.
	Ascending SortOrder = iota
	// Descending sorts values from largest to smallest.
	Descending
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL