fileframe

package module
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 11, 2025 License: MIT Imports: 12 Imported by: 0

README

Go Reference Go Report Card MultiPlatformUnitTest Coverage

fileframe

logo

fileframe is a lightweight, immutable DataFrame library for Go. It provides intuitive data manipulation without the complexity of pandas-like APIs.

Features

  • Immutable operations - All methods return new DataFrames, never modify the original
  • Multiple file formats - CSV, TSV, LTSV, Parquet, XLSX
  • Compression support - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4

Installation

go get github.com/nao1215/fileframe

Quick Start

1. Load Data
// From file (auto-detects format and compression)
df, err := fileframe.NewDataFrameFromPath("data.csv")
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")  // compressed
df, err := fileframe.NewDataFrameFromPath("data.parquet") // Parquet

// From slice of maps
df := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Alice", "age": 30, "city": "Tokyo"},
    {"name": "Bob", "age": 25, "city": "Osaka"},
})

// From io.Reader
df, err := fileframe.NewDataFrame(reader, fileframe.CSV)
2. Transform Data
// All operations return NEW DataFrames (immutable)
result := df.
    Filter(func(row map[string]any) bool {
        age, _ := row["age"].(int64)
        return age >= 20
    }).
    Mutate("adult", func(row map[string]any) any {
        return true
    }).
    Select("name", "age", "adult")
3. Aggregate Data
grouped, err := df.GroupBy("city")
if err != nil {
    log.Fatal(err)
}

totals, err := grouped.Sum("sales")    // Returns (*DataFrame, error)
averages, err := grouped.Mean("price") // Returns (*DataFrame, error)
counts := grouped.Count()              // Returns *DataFrame
4. Export Data
err := df.ToCSV("output.csv")
err := df.ToTSV("output.tsv")
records := df.ToRecords() // []map[string]any

Core Concepts

Immutability

Every operation returns a new DataFrame. The original is never modified.

original := fileframe.NewDataFrameFromRecords(data)
filtered := original.Filter(fn)  // original is unchanged
mutated := filtered.Mutate(...)  // filtered is unchanged
Method Chaining

Chain multiple operations for clean, readable code:

result := df.
    FillNA(0).
    Filter(filterFn).
    Mutate("new_col", mutateFn).
    Select("col1", "col2", "new_col").
    Head(100)
Error Handling

Operations that can fail return (*DataFrame, error):

// These return errors
grouped, err := df.GroupBy("category")
sorted, err := df.Sort("price", fileframe.Descending)
joined, err := df.Join(other, opt)

// These never fail (return *DataFrame directly)
filtered := df.Filter(fn)
selected := df.Select("col1", "col2")
head := df.Head(10)

Common Operations

Filtering & Selection
// Filter rows
adults := df.Filter(func(row map[string]any) bool {
    age, ok := row["age"].(int64)
    return ok && age >= 18
})

// Select columns
subset := df.Select("name", "email", "phone")

// Drop columns
cleaned := df.Drop("internal_id", "debug_flag")
Adding & Modifying Columns
// Add new column
withTotal := df.Mutate("total", func(row map[string]any) any {
    qty, _ := row["quantity"].(int64)
    price, _ := row["price"].(float64)
    return float64(qty) * price
})

// Rename columns
renamed, err := df.Rename("old_name", "new_name")
renamed, err := df.RenameColumns(map[string]string{
    "col1": "column_one",
    "col2": "column_two",
})
Sorting & Deduplication
// Sort by single column
sorted, err := df.Sort("price", fileframe.Descending)

// Sort by multiple columns
sorted, err := df.SortBy(
    fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
    fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)

// Remove duplicates
unique := df.Distinct()
unique := df.DistinctBy("email") // by specific column
Row Selection
first10 := df.Head(10)
last5 := df.Tail(5)
limited := df.Limit(100) // alias for Head
Missing Values
// Remove rows with nil
cleaned := df.DropNA()
cleaned := df.DropNASubset("required_field")

// Fill nil values
filled := df.FillNA(0)
filled := df.FillNAByColumn(map[string]any{
    "name":   "Unknown",
    "age":    0,
    "active": false,
})
Joining DataFrames
users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
})

orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"user_id": 1, "product": "Laptop"},
    {"user_id": 1, "product": "Mouse"},
})

// Join types: InnerJoin, LeftJoin, RightJoin, OuterJoin
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"id", "user_id"}, // left column, right column
    How: fileframe.LeftJoin,
})
Concatenating DataFrames
// Same schema
combined, err := df1.Concat(df2, df3)

// Different schemas (union of columns, nil for missing)
combined, err := fileframe.ConcatAll(df1, df2, df3)
GroupBy & Aggregation
grouped, err := df.GroupBy("category")
if err != nil {
    log.Fatal(err)
}

// Built-in aggregations
counts := grouped.Count()              // *DataFrame
sums, err := grouped.Sum("amount")     // (*DataFrame, error)
means, err := grouped.Mean("price")    // (*DataFrame, error)
mins, err := grouped.Min("value")      // (*DataFrame, error)
maxs, err := grouped.Max("value")      // (*DataFrame, error)

// Custom aggregation
median, err := grouped.Agg("value", func(values []any) any {
    // Your aggregation logic here
    return computeMedian(values)
})

// Global aggregation (no grouping)
globalGrouped, _ := df.GroupBy()
totalSum, _ := globalGrouped.Sum("amount")

Complete Example

package main

import (
    "fmt"
    "log"

    "github.com/nao1215/fileframe"
)

func main() {
    // Load sales data
    df, err := fileframe.NewDataFrameFromPath("sales.csv")
    if err != nil {
        log.Fatal(err)
    }

    // Process data
    result := df.
        FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
        Filter(func(row map[string]any) bool {
            amount, _ := row["amount"].(int64)
            return amount > 0
        }).
        Mutate("revenue", func(row map[string]any) any {
            qty, _ := row["quantity"].(int64)
            price, _ := row["price"].(int64)
            return qty * price
        })

    // Aggregate by region
    grouped, err := result.GroupBy("region")
    if err != nil {
        log.Fatal(err)
    }

    byRegion, err := grouped.Sum("revenue")
    if err != nil {
        log.Fatal(err)
    }

    // Sort and get top 3
    sorted, err := byRegion.Sort("sum_revenue", fileframe.Descending)
    if err != nil {
        log.Fatal(err)
    }
    top3 := sorted.Head(3)

    // Output results
    for _, row := range top3.ToRecords() {
        fmt.Printf("%s: %v\n", row["region"], row["sum_revenue"])
    }

    // Export
    if err := top3.ToCSV("top_regions.csv"); err != nil {
        log.Fatal(err)
    }
}

Supported File Formats

Format Read Write Compression
CSV Yes Yes gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
TSV Yes Yes gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
LTSV Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
Parquet Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
XLSX Yes - gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4
Supported Compression Formats
Format Extension Library Notes
gzip .gz compress/gzip Standard library
bzip2 .bz2 compress/bzip2 Standard library
xz .xz github.com/ulikunitz/xz Pure Go
zstd .zst github.com/klauspost/compress/zstd Pure Go, high performance
zlib .z compress/zlib Standard library
snappy .snappy github.com/klauspost/compress/snappy Pure Go, high performance
s2 .s2 github.com/klauspost/compress/s2 Snappy-compatible, faster
lz4 .lz4 github.com/pierrec/lz4/v4 Pure Go

Performance

Benchmarks on AMD RYZEN AI MAX+ 395:

Operation 100 rows 1,000 rows 10,000 rows
CSV Parse 140 µs 1.4 ms 5.1 ms
Filter 27 µs 304 µs 2.9 ms
Select 13 µs 110 µs 1.5 ms
Mutate 37 µs 332 µs 4.5 ms
GroupBy + Sum 7 µs 57 µs 635 µs

Memory usage (10,000 rows):

  • CSV Parse: 8.4 MB
  • Filter: 5.2 MB
  • GroupBy + Sum: 408 KB

When to Use fileframe

Use fileframe when:

  • Working with small to medium datasets (< 100,000 rows)
  • Need simple, readable data transformations
  • Want immutable operations for predictable code
  • Working with multiple file formats

Consider alternatives when:

  • Processing very large files (use filesql for streaming)
  • Need complex SQL-like queries (use filesql)
  • Require lazy evaluation

Contributing

Contributions are welcome! Please see the Contributing Guide for details.

Support

If you find this project useful, please consider giving it a star on GitHub.

License

MIT License - see LICENSE for details.

Documentation

Overview

Package fileframe provides a lightweight table utility that bridges fileprep and filesql.

fileframe is not a degraded copy of Pandas, but a practical tabular data manipulation tool that is idiomatic to Go. It follows the UNIX philosophy of doing one thing well.

Design Philosophy

  • Small: Do one thing well (UNIX philosophy)
  • Practical: Only features used in real data analysis
  • Simple and clear: API is self-explanatory
  • Intuitive: Natural Go-like coding style
  • Extensible: Complex features delegated to filesql

Basic Usage

// Create DataFrame from CSV
f, _ := os.Open("sales.csv")
defer f.Close()

df, err := fileframe.NewDataFrame(f, fileframe.CSV)
if err != nil {
    log.Fatal(err)
}

// Select columns and filter rows
result := df.
    Select("product", "amount", "category").
    Filter(func(row map[string]any) bool {
        amount, ok := row["amount"].(float64)
        return ok && amount > 1000
    })

// Group by and aggregate
grouped, err := result.GroupBy("category")
if err != nil {
    log.Fatal(err)
}
summary, err := grouped.Sum("amount")
if err != nil {
    log.Fatal(err)
}

// Output to CSV
if err := summary.ToCSV("summary.csv"); err != nil {
    log.Fatal(err)
}

Architecture

fileframe sits between fileprep (preprocessing) and filesql (persistence/SQL):

  • Receives output from fileprep (io.Reader -> DataFrame)
  • Performs basic transformations (Select, Filter, Mutate, GroupBy)
  • Outputs to CSV or passes to filesql

For complex operations like Window functions, subqueries, or large-scale data processing, use filesql directly.

Important Notes

  • All operations execute immediately (no lazy evaluation)
  • Target scale: Small to medium data (under 100,000 rows)
  • Row-oriented design with []map[string]any
  • All methods return new DataFrames (immutable operations)
Example

Example demonstrates basic DataFrame operations: reading CSV, filtering rows, grouping, and aggregating data.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sample sales data
	csvData := `product,amount,category
Apple,100,Fruit
Banana,150,Fruit
Carrot,80,Vegetable
Orange,120,Fruit
Broccoli,90,Vegetable`

	// Create DataFrame from CSV
	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Total rows: %d\n", df.Len())
	fmt.Printf("Columns: %v\n", df.Columns())

	// Filter: only items with amount > 100
	filtered := df.Filter(func(row map[string]any) bool {
		amount, ok := row["amount"].(int64)
		return ok && amount > 100
	})
	fmt.Printf("Rows with amount > 100: %d\n", filtered.Len())

	// GroupBy category and sum amounts
	groupedDf, err := df.GroupBy("category")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	grouped, err := groupedDf.Sum("amount")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Printf("Grouped columns: %v\n", grouped.Columns())

	// Show grouped results
	for _, row := range grouped.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["category"], row["sum_amount"])
	}

}
Output:
Total rows: 5
Columns: [product amount category]
Rows with amount > 100: 2
Grouped columns: [category sum_amount]
  Fruit: 370
  Vegetable: 170
Example (ComplexOperations)

Example_complexOperations demonstrates advanced DataFrame operations including multiple aggregations, data transformation with Mutate, and combining results from different DataFrames.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sales data
	salesCSV := `order_id,product_id,quantity,unit_price
1,P001,2,100
2,P002,1,200
3,P001,3,100
4,P003,5,50
5,P002,2,200`

	// Product master data
	productsCSV := `product_id,name,category
P001,Laptop Stand,Electronics
P002,Mechanical Keyboard,Electronics
P003,Notebook,Stationery`

	// Create DataFrames
	sales, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV)       //nolint:errcheck // example code
	products, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck // example code

	// Add calculated column: total_amount = quantity * unit_price
	salesWithTotal := sales.Mutate("total_amount", func(row map[string]any) any {
		qty, _ := row["quantity"].(int64)     //nolint:errcheck // example code
		price, _ := row["unit_price"].(int64) //nolint:errcheck // example code
		return qty * price
	})

	fmt.Println("=== Sales with Total Amount ===")
	for _, row := range salesWithTotal.ToRecords() {
		fmt.Printf("Order %v: %v x %v = %v\n",
			row["order_id"], row["quantity"], row["unit_price"], row["total_amount"])
	}

	// Aggregate sales by product_id
	salesByProductGrp, _ := salesWithTotal.GroupBy("product_id") //nolint:errcheck // example code
	salesByProduct, _ := salesByProductGrp.Sum("total_amount")   //nolint:errcheck // example code

	fmt.Println("\n=== Sales by Product ===")
	for _, row := range salesByProduct.ToRecords() {
		fmt.Printf("%s: %.0f\n", row["product_id"], row["sum_total_amount"])
	}

	// Create a lookup map from products DataFrame
	productLookup := make(map[string]map[string]any)
	for _, row := range products.ToRecords() {
		pid, _ := row["product_id"].(string) //nolint:errcheck // example code
		productLookup[pid] = row
	}

	// Combine sales summary with product info (manual join)
	combinedRecords := make([]map[string]any, 0)
	for _, salesRow := range salesByProduct.ToRecords() {
		pid, _ := salesRow["product_id"].(string) //nolint:errcheck // example code
		if productInfo, exists := productLookup[pid]; exists {
			combined := map[string]any{
				"product_id":  pid,
				"name":        productInfo["name"],
				"category":    productInfo["category"],
				"total_sales": salesRow["sum_total_amount"],
			}
			combinedRecords = append(combinedRecords, combined)
		}
	}
	combined := fileframe.NewDataFrameFromRecords(combinedRecords)

	fmt.Println("\n=== Combined Sales Report ===")
	fmt.Printf("Columns: %v\n", combined.Columns())
	for _, row := range combined.ToRecords() {
		fmt.Printf("%s (%s): %.0f\n",
			row["name"], row["category"], row["total_sales"])
	}

	// Group combined data by category
	byCategoryGrp, _ := combined.GroupBy("category")  //nolint:errcheck // example code
	byCategory, _ := byCategoryGrp.Sum("total_sales") //nolint:errcheck // example code

	fmt.Println("\n=== Total Sales by Category ===")
	for _, row := range byCategory.ToRecords() {
		fmt.Printf("%s: %.0f\n", row["category"], row["sum_total_sales"])
	}

	// Calculate statistics
	fmt.Println("\n=== Sales Statistics ===")
	statsGrp, _ := salesWithTotal.GroupBy()  //nolint:errcheck // example code
	stats, _ := statsGrp.Sum("total_amount") //nolint:errcheck // example code
	for _, row := range stats.ToRecords() {
		fmt.Printf("Total Revenue: %.0f\n", row["sum_total_amount"])
	}

	meanSalesGrp, _ := salesWithTotal.GroupBy()       //nolint:errcheck // example code
	meanSales, _ := meanSalesGrp.Mean("total_amount") //nolint:errcheck // example code
	for _, row := range meanSales.ToRecords() {
		fmt.Printf("Average Order Value: %.0f\n", row["mean_total_amount"])
	}

	minSalesGrp, _ := salesWithTotal.GroupBy()     //nolint:errcheck // example code
	minSales, _ := minSalesGrp.Min("total_amount") //nolint:errcheck // example code
	for _, row := range minSales.ToRecords() {
		fmt.Printf("Min Order: %.0f\n", row["min_total_amount"])
	}

	maxSalesGrp, _ := salesWithTotal.GroupBy()     //nolint:errcheck // example code
	maxSales, _ := maxSalesGrp.Max("total_amount") //nolint:errcheck // example code
	for _, row := range maxSales.ToRecords() {
		fmt.Printf("Max Order: %.0f\n", row["max_total_amount"])
	}

}
Output:
=== Sales with Total Amount ===
Order 1: 2 x 100 = 200
Order 2: 1 x 200 = 200
Order 3: 3 x 100 = 300
Order 4: 5 x 50 = 250
Order 5: 2 x 200 = 400

=== Sales by Product ===
P001: 500
P002: 600
P003: 250

=== Combined Sales Report ===
Columns: [category name product_id total_sales]
Laptop Stand (Electronics): 500
Mechanical Keyboard (Electronics): 600
Notebook (Stationery): 250

=== Total Sales by Category ===
Electronics: 1100
Stationery: 250

=== Sales Statistics ===
Total Revenue: 1350
Average Order Value: 270
Min Order: 200
Max Order: 400
Example (Concat)

Example_concat demonstrates vertical concatenation of DataFrames with the same schema. Use Concat when combining data from multiple sources with identical columns.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Sales data from different regions (same schema)
	tokyoCSV := `region,product,sales
Tokyo,Laptop,100
Tokyo,Mouse,300`

	osakaCSV := `region,product,sales
Osaka,Laptop,80
Osaka,Mouse,250
Osaka,Keyboard,120`

	tokyo, _ := fileframe.NewDataFrame(strings.NewReader(tokyoCSV), fileframe.CSV) //nolint:errcheck
	osaka, _ := fileframe.NewDataFrame(strings.NewReader(osakaCSV), fileframe.CSV) //nolint:errcheck

	// Concat requires identical columns
	combined, err := tokyo.Concat(osaka)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Tokyo rows: %d\n", tokyo.Len())
	fmt.Printf("Osaka rows: %d\n", osaka.Len())
	fmt.Printf("Combined rows: %d\n", combined.Len())

	// Now we can analyze the combined data
	grouped, _ := combined.GroupBy("product") //nolint:errcheck
	totals, _ := grouped.Sum("sales")         //nolint:errcheck

	// Sort by product name for deterministic output
	sortedTotals, _ := totals.Sort("product", fileframe.Ascending) //nolint:errcheck

	fmt.Println("\nTotal sales by product:")
	for _, row := range sortedTotals.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["product"], row["sum_sales"])
	}

}
Output:
Tokyo rows: 2
Osaka rows: 3
Combined rows: 5

Total sales by product:
  Keyboard: 120
  Laptop: 180
  Mouse: 550
Example (ConcatAll)

Example_concatAll demonstrates flexible concatenation of DataFrames with different schemas. ConcatAll automatically handles different column sets by creating a union of all columns.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Data from 2023 - basic schema
	data2023CSV := `year,product,sales
2023,Laptop,1000
2023,Mouse,500`

	// Data from 2024 - added "region" column
	data2024CSV := `year,product,sales,region
2024,Laptop,1200,Tokyo
2024,Mouse,600,Osaka
2024,Keyboard,300,Tokyo`

	df2023, _ := fileframe.NewDataFrame(strings.NewReader(data2023CSV), fileframe.CSV) //nolint:errcheck
	df2024, _ := fileframe.NewDataFrame(strings.NewReader(data2024CSV), fileframe.CSV) //nolint:errcheck

	fmt.Printf("2023 columns: %v\n", df2023.Columns())
	fmt.Printf("2024 columns: %v\n", df2024.Columns())

	// ConcatAll handles different schemas
	combined, err := fileframe.ConcatAll(df2023, df2024)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Printf("Combined columns: %v\n", combined.Columns())
	fmt.Printf("Combined rows: %d\n", combined.Len())

	// 2023 data will have nil for "region"
	fmt.Println("\nCombined data:")
	for _, row := range combined.ToRecords() {
		region := row["region"]
		if region == nil {
			region = "(no region)"
		}
		// Handle int64 vs float64 for sales column
		var sales float64
		switch v := row["sales"].(type) {
		case int64:
			sales = float64(v)
		case float64:
			sales = v
		}
		fmt.Printf("  %v %s: %.0f - %v\n",
			row["year"], row["product"], sales, region)
	}

}
Output:
2023 columns: [year product sales]
2024 columns: [year product sales region]
Combined columns: [product region sales year]
Combined rows: 5

Combined data:
  2023 Laptop: 1000 - (no region)
  2023 Mouse: 500 - (no region)
  2024 Laptop: 1200 - Tokyo
  2024 Mouse: 600 - Osaka
  2024 Keyboard: 300 - Tokyo
Example (CustomAggregation)

Example_customAggregation demonstrates how to use the Agg function to implement custom aggregation logic such as median calculation.

package main

import (
	"fmt"
	"slices"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `category,value
A,10
A,20
A,30
A,40
A,50
B,5
B,15
B,25`

	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	grouped, err := df.GroupBy("category")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Custom aggregation: calculate median
	median := func(values []any) any {
		// Filter and convert to float64
		nums := make([]float64, 0, len(values))
		for _, v := range values {
			switch n := v.(type) {
			case int64:
				nums = append(nums, float64(n))
			case float64:
				nums = append(nums, n)
			}
		}
		if len(nums) == 0 {
			return nil
		}

		// Sort values
		slices.Sort(nums)

		// Calculate median
		mid := len(nums) / 2
		if len(nums)%2 == 0 {
			return (nums[mid-1] + nums[mid]) / 2
		}
		return nums[mid]
	}

	result, _ := grouped.Agg("value", median) //nolint:errcheck // example code

	fmt.Println("Median by category:")
	for _, row := range result.ToRecords() {
		fmt.Printf("  %s: %.1f\n", row["category"], row["agg_value"])
	}

	// Custom aggregation: calculate range (max - min)
	rangeFunc := func(values []any) any {
		var minVal, maxVal float64
		first := true
		for _, v := range values {
			var n float64
			switch val := v.(type) {
			case int64:
				n = float64(val)
			case float64:
				n = val
			default:
				continue
			}
			if first {
				minVal, maxVal = n, n
				first = false
			} else {
				if n < minVal {
					minVal = n
				}
				if n > maxVal {
					maxVal = n
				}
			}
		}
		if first {
			return nil
		}
		return maxVal - minVal
	}

	rangeResult, _ := grouped.Agg("value", rangeFunc) //nolint:errcheck // example code

	fmt.Println("Range by category:")
	for _, row := range rangeResult.ToRecords() {
		fmt.Printf("  %s: %.0f\n", row["category"], row["agg_value"])
	}

}
Output:
Median by category:
  A: 30.0
  B: 15.0
Range by category:
  A: 40
  B: 20
Example (DataframePipeline)

Example_dataframePipeline demonstrates chaining multiple DataFrame operations to build a complete data processing pipeline.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Raw sales data with some issues
	salesCSV := `date,region,product,quantity,price,salesperson
2024-01-15,Tokyo,Laptop,2,1000,Alice
2024-01-15,Tokyo,Mouse,10,25,Alice
2024-01-16,Osaka,Laptop,1,1000,Bob
2024-01-16,Osaka,Keyboard,5,75,
2024-01-17,Tokyo,Monitor,3,300,Charlie
2024-01-17,Nagoya,Mouse,8,25,Diana`

	df, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV) //nolint:errcheck

	// Pipeline: Clean -> Transform -> Aggregate -> Sort -> Limit
	result := df.
		// 1. Fill missing salesperson
		FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
		// 2. Add calculated column
		Mutate("revenue", func(row map[string]any) any {
			qty, _ := row["quantity"].(int64) //nolint:errcheck
			price, _ := row["price"].(int64)  //nolint:errcheck
			return float64(qty) * float64(price)
		}).
		// 3. Select relevant columns
		Select("region", "product", "revenue", "salesperson")

	// Group by region and sum revenue
	grouped, _ := result.GroupBy("region") //nolint:errcheck
	byRegion, _ := grouped.Sum("revenue")  //nolint:errcheck

	// Sort by revenue descending
	sorted, _ := byRegion.Sort("sum_revenue", fileframe.Descending) //nolint:errcheck

	fmt.Println("Revenue by Region (Top to Bottom):")
	for _, row := range sorted.ToRecords() {
		fmt.Printf("  %s: $%.0f\n", row["region"], row["sum_revenue"])
	}

	// Also get top 3 individual sales
	topSales, _ := result.Sort("revenue", fileframe.Descending) //nolint:errcheck
	fmt.Println("\nTop 3 Sales:")
	for _, row := range topSales.Head(3).ToRecords() {
		fmt.Printf("  %s in %s: $%.0f (by %s)\n",
			row["product"], row["region"], row["revenue"], row["salesperson"])
	}

}
Output:
Revenue by Region (Top to Bottom):
  Tokyo: $3150
  Osaka: $1375
  Nagoya: $200

Top 3 Sales:
  Laptop in Tokyo: $2000 (by Alice)
  Laptop in Osaka: $1000 (by Bob)
  Monitor in Tokyo: $900 (by Charlie)
Example (DropRename)

Example_dropRename demonstrates column manipulation operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `user_id,first_name,last_name,internal_code,email
1,Alice,Smith,X123,alice@example.com
2,Bob,Jones,X456,bob@example.com`

	df, _ := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Original columns: %v\n", df.Columns())

	// Drop internal column
	cleaned := df.Drop("internal_code")
	fmt.Printf("After Drop: %v\n", cleaned.Columns())

	// Rename columns for clarity
	renamed, _ := cleaned.RenameColumns(map[string]string{ //nolint:errcheck
		"first_name": "first",
		"last_name":  "last",
	})
	fmt.Printf("After Rename: %v\n", renamed.Columns())

	// Single column rename
	final, _ := renamed.Rename("user_id", "id") //nolint:errcheck
	fmt.Printf("Final columns: %v\n", final.Columns())

}
Output:
Original columns: [user_id first_name last_name internal_code email]
After Drop: [user_id first_name last_name email]
After Rename: [user_id first last email]
Final columns: [id first last email]
Example (FileFormats)

Example_fileFormats shows the various file formats supported by fileframe. NewDataFrameFromPath automatically detects file type and handles compression.

Supported formats:

  • CSV, TSV, LTSV, XLSX, Parquet
  • Compressed variants: .gz, .bz2, .xz, .zst, .z, .snappy, .s2, .lz4

Usage:

// Auto-detect CSV
df, err := fileframe.NewDataFrameFromPath("data.csv")

// Auto-detect compressed CSV (gzip)
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")

// Auto-detect TSV with zstd compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.zst")

// Auto-detect CSV with snappy compression
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")

// Auto-detect TSV with lz4 compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.lz4")

// Auto-detect Excel file
df, err := fileframe.NewDataFrameFromPath("spreadsheet.xlsx")

// Auto-detect Parquet file
df, err := fileframe.NewDataFrameFromPath("data.parquet")
package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// This example demonstrates the API for reading various file formats.
	// Since Example functions require deterministic output, we show
	// equivalent operations using NewDataFrame with explicit file types.

	// TSV (Tab-Separated Values)
	tsvData := "name\tage\tcity\nAlice\t30\tTokyo\nBob\t25\tOsaka"
	dfTSV, _ := fileframe.NewDataFrame(strings.NewReader(tsvData), fileframe.TSV) //nolint:errcheck
	fmt.Printf("TSV columns: %v, rows: %d\n", dfTSV.Columns(), dfTSV.Len())

	// LTSV (Labeled Tab-Separated Values)
	ltsvData := "name:Alice\tage:30\tcity:Tokyo\nname:Bob\tage:25\tcity:Osaka"
	dfLTSV, _ := fileframe.NewDataFrame(strings.NewReader(ltsvData), fileframe.LTSV) //nolint:errcheck
	fmt.Printf("LTSV columns: %v, rows: %d\n", dfLTSV.Columns(), dfLTSV.Len())

	// For file-based operations with compression, use NewDataFrameFromPath:
	//
	//   df, err := fileframe.NewDataFrameFromPath("logs.csv.gz")      // gzip
	//   df, err := fileframe.NewDataFrameFromPath("data.tsv.bz2")     // bzip2
	//   df, err := fileframe.NewDataFrameFromPath("export.csv.xz")    // xz
	//   df, err := fileframe.NewDataFrameFromPath("archive.csv.zst")  // zstd
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.z")       // zlib
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")  // snappy
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.s2")      // s2
	//   df, err := fileframe.NewDataFrameFromPath("data.csv.lz4")     // lz4

}
Output:
TSV columns: [name age city], rows: 2
LTSV columns: [name age city], rows: 2
Example (GlobalAggregation)

Example_globalAggregation demonstrates how to calculate global statistics without grouping by any column. Call GroupBy() with no arguments to aggregate the entire DataFrame into a single result.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	csvData := `product,price,quantity
Laptop,1000,5
Mouse,25,50
Keyboard,75,30
Monitor,300,10`

	df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// GroupBy() with no arguments = global aggregation (entire DataFrame as one group)
	grouped, err := df.GroupBy()
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Calculate various statistics for the entire dataset
	count := grouped.Count()
	fmt.Printf("Total products: %d\n", count.ToRecords()[0]["count"])

	sumResult, _ := grouped.Sum("price") //nolint:errcheck // example code
	fmt.Printf("Sum of prices: %.0f\n", sumResult.ToRecords()[0]["sum_price"])

	meanResult, _ := grouped.Mean("quantity") //nolint:errcheck // example code
	fmt.Printf("Average quantity: %.2f\n", meanResult.ToRecords()[0]["mean_quantity"])

	minResult, _ := grouped.Min("price") //nolint:errcheck // example code
	fmt.Printf("Min price: %.0f\n", minResult.ToRecords()[0]["min_price"])

	maxResult, _ := grouped.Max("price") //nolint:errcheck // example code
	fmt.Printf("Max price: %.0f\n", maxResult.ToRecords()[0]["max_price"])

}
Output:
Total products: 4
Sum of prices: 1400
Average quantity: 23.75
Min price: 25
Max price: 1000
Example (HandleMissingValues)

Example_handleMissingValues demonstrates DropNA and FillNA operations.

package main

import (
	"fmt"

	"github.com/nao1215/fileframe"
)

func main() {
	// Create DataFrame with nil values
	records := []map[string]any{
		{"name": "Alice", "age": int64(30), "city": "Tokyo"},
		{"name": "Bob", "age": nil, "city": "Osaka"},
		{"name": nil, "age": int64(25), "city": nil},
		{"name": "Diana", "age": int64(35), "city": "Kyoto"},
	}
	df := fileframe.NewDataFrameFromRecords(records)

	fmt.Printf("Original rows: %d\n", df.Len())

	// DropNA: Remove rows with any nil values
	cleaned := df.DropNA()
	fmt.Printf("After DropNA: %d rows\n", cleaned.Len())

	// DropNASubset: Remove rows with nil only in specific columns
	partialClean := df.DropNASubset("name")
	fmt.Printf("After DropNASubset(name): %d rows\n", partialClean.Len())

	// FillNA: Replace all nil values with a default
	filled := df.FillNA("Unknown")
	fmt.Println("\nAfter FillNA('Unknown'):")
	for _, row := range filled.ToRecords() {
		fmt.Printf("  %v, %v, %v\n", row["name"], row["age"], row["city"])
	}

	// FillNAByColumn: Different defaults per column
	smartFilled := df.FillNAByColumn(map[string]any{
		"name": "Anonymous",
		"age":  int64(0),
		"city": "Unknown",
	})
	fmt.Println("\nAfter FillNAByColumn:")
	for _, row := range smartFilled.ToRecords() {
		fmt.Printf("  %v, %v, %v\n", row["name"], row["age"], row["city"])
	}

}
Output:
Original rows: 4
After DropNA: 2 rows
After DropNASubset(name): 3 rows

After FillNA('Unknown'):
  Alice, 30, Tokyo
  Bob, Unknown, Osaka
  Unknown, 25, Unknown
  Diana, 35, Kyoto

After FillNAByColumn:
  Alice, 30, Tokyo
  Bob, 0, Osaka
  Anonymous, 25, Unknown
  Diana, 35, Kyoto
Example (HeadTailLimit)

Example_headTailLimit demonstrates row selection operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	data := `id,value
1,100
2,200
3,300
4,400
5,500
6,600
7,700`

	frame, _ := fileframe.NewDataFrame(strings.NewReader(data), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Total rows: %d\n", frame.Len())

	// First three rows.
	fmt.Printf("\nHead(3) - first 3 rows:\n")
	for _, r := range frame.Head(3).ToRecords() {
		fmt.Printf("  id=%v, value=%v\n", r["id"], r["value"])
	}

	// Last two rows.
	fmt.Printf("\nTail(2) - last 2 rows:\n")
	for _, r := range frame.Tail(2).ToRecords() {
		fmt.Printf("  id=%v, value=%v\n", r["id"], r["value"])
	}

	// Limit behaves exactly like Head and reads like SQL.
	fmt.Printf("\nLimit(2) rows: %d\n", frame.Limit(2).Len())

}
Output:
Total rows: 7

Head(3) - first 3 rows:
  id=1, value=100
  id=2, value=200
  id=3, value=300

Tail(2) - last 2 rows:
  id=6, value=600
  id=7, value=700

Limit(2) rows: 2
Example (Join)

Example_join demonstrates how to combine two DataFrames using Join. This is similar to SQL JOIN operations and supports inner, left, right, and outer joins.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Left side: user records keyed by id.
	userData := `id,name,department
1,Alice,Engineering
2,Bob,Marketing
3,Charlie,Engineering
4,Diana,Sales`

	// Right side: orders whose user_id references users.id.
	orderData := `order_id,user_id,product,amount
101,1,Laptop,1200
102,1,Mouse,50
103,2,Monitor,400
104,5,Keyboard,100`

	userDF, _ := fileframe.NewDataFrame(strings.NewReader(userData), fileframe.CSV)   //nolint:errcheck
	orderDF, _ := fileframe.NewDataFrame(strings.NewReader(orderData), fileframe.CSV) //nolint:errcheck

	// Inner join keeps only users that placed at least one order.
	joined, _ := userDF.Join(orderDF, fileframe.JoinOption{ //nolint:errcheck
		On:  []string{"id", "user_id"}, // left column, then right column
		How: fileframe.InnerJoin,
	})
	fmt.Println("=== Inner Join (users with orders) ===")
	fmt.Printf("Rows: %d\n", joined.Len())
	for _, r := range joined.ToRecords() {
		fmt.Printf("  %s ordered %s ($%v)\n", r["name"], r["product"], r["amount"])
	}

	// Left join keeps every user; unmatched order columns come back nil.
	allUsers, _ := userDF.Join(orderDF, fileframe.JoinOption{ //nolint:errcheck
		On:  []string{"id", "user_id"},
		How: fileframe.LeftJoin,
	})
	fmt.Println("\n=== Left Join (all users) ===")
	fmt.Printf("Rows: %d\n", allUsers.Len())

	// Tally the users whose order_id is nil, i.e. users with no orders.
	missing := 0
	for _, r := range allUsers.ToRecords() {
		if r["order_id"] == nil {
			missing++
		}
	}
	fmt.Printf("Users without orders: %d\n", missing)

}
Output:
=== Inner Join (users with orders) ===
Rows: 3
  Alice ordered Laptop ($1200)
  Alice ordered Mouse ($50)
  Bob ordered Monitor ($400)

=== Left Join (all users) ===
Rows: 5
Users without orders: 2
Example (JoinTypes)

Example_joinTypes demonstrates all four join types: InnerJoin, LeftJoin, RightJoin, and OuterJoin.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	// Left DataFrame: the product catalog.
	productsCSV := `product_id,name
P1,Laptop
P2,Mouse
P3,Keyboard`

	// Right DataFrame: stock levels; P4 has no matching product.
	inventoryCSV := `item_id,quantity,warehouse
P1,50,Tokyo
P2,200,Osaka
P4,30,Tokyo`

	prodDF, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck
	invDF, _ := fileframe.NewDataFrame(strings.NewReader(inventoryCSV), fileframe.CSV) //nolint:errcheck

	// joinWith runs one join of the requested type on the same key pair.
	joinWith := func(how fileframe.JoinType) *fileframe.DataFrame {
		res, _ := prodDF.Join(invDF, fileframe.JoinOption{ //nolint:errcheck
			On:  []string{"product_id", "item_id"},
			How: how,
		})
		return res
	}

	// Inner: intersection; Left: all products; Right: all inventory;
	// Outer: union of both sides.
	fmt.Printf("Inner Join: %d rows (products in inventory)\n", joinWith(fileframe.InnerJoin).Len())
	fmt.Printf("Left Join: %d rows (all products)\n", joinWith(fileframe.LeftJoin).Len())
	fmt.Printf("Right Join: %d rows (all inventory)\n", joinWith(fileframe.RightJoin).Len())
	fmt.Printf("Outer Join: %d rows (all products + all inventory)\n", joinWith(fileframe.OuterJoin).Len())

}
Output:
Inner Join: 2 rows (products in inventory)
Left Join: 3 rows (all products)
Right Join: 3 rows (all inventory)
Outer Join: 4 rows (all products + all inventory)
Example (SortAndDistinct)

Example_sortAndDistinct demonstrates sorting and deduplication operations.

package main

import (
	"fmt"
	"strings"

	"github.com/nao1215/fileframe"
)

func main() {
	data := `name,category,score
Alice,A,85
Bob,B,90
Charlie,A,85
Alice,A,85
Diana,B,75
Eve,A,95`

	frame, _ := fileframe.NewDataFrame(strings.NewReader(data), fileframe.CSV) //nolint:errcheck

	fmt.Printf("Original rows: %d\n", frame.Len())

	// Drop exact-duplicate rows (the second "Alice,A,85").
	deduped := frame.Distinct()
	fmt.Printf("After Distinct: %d rows\n", deduped.Len())

	// Single-column sort: highest score first.
	byScore, _ := deduped.Sort("score", fileframe.Descending) //nolint:errcheck
	fmt.Println("\nTop scores:")
	for _, r := range byScore.Head(3).ToRecords() {
		fmt.Printf("  %s: %v\n", r["name"], r["score"])
	}

	// Multi-column sort: category ascending, ties broken by score descending.
	byCategory, _ := deduped.SortBy( //nolint:errcheck
		fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
		fileframe.SortOption{Column: "score", Order: fileframe.Descending},
	)
	fmt.Println("\nBy category, then score:")
	for _, r := range byCategory.ToRecords() {
		fmt.Printf("  [%s] %s: %v\n", r["category"], r["name"], r["score"])
	}

}
Output:
Original rows: 6
After Distinct: 5 rows

Top scores:
  Eve: 95
  Bob: 90
  Alice: 85

By category, then score:
  [A] Eve: 95
  [A] Alice: 85
  [A] Charlie: 85
  [B] Bob: 90
  [B] Diana: 75

Index

Examples

Constants

View Source
const (
	// CSV represents CSV file type
	CSV = fileparser.CSV
	// TSV represents TSV file type
	TSV = fileparser.TSV
	// LTSV represents LTSV file type
	LTSV = fileparser.LTSV
	// Parquet represents Parquet file type
	Parquet = fileparser.Parquet
	// XLSX represents Excel XLSX file type
	XLSX = fileparser.XLSX

	// Compressed CSV variants (gzip, bzip2, xz, zstd).
	CSVGZ   = fileparser.CSVGZ
	CSVBZ2  = fileparser.CSVBZ2
	CSVXZ   = fileparser.CSVXZ
	CSVZSTD = fileparser.CSVZSTD

	// Compressed TSV variants (gzip, bzip2, xz, zstd).
	TSVGZ   = fileparser.TSVGZ
	TSVBZ2  = fileparser.TSVBZ2
	TSVXZ   = fileparser.TSVXZ
	TSVZSTD = fileparser.TSVZSTD

	// Compressed LTSV variants (gzip, bzip2, xz, zstd).
	LTSVGZ   = fileparser.LTSVGZ
	LTSVBZ2  = fileparser.LTSVBZ2
	LTSVXZ   = fileparser.LTSVXZ
	LTSVZSTD = fileparser.LTSVZSTD

	// Compressed Parquet variants (gzip, bzip2, xz, zstd).
	ParquetGZ   = fileparser.ParquetGZ
	ParquetBZ2  = fileparser.ParquetBZ2
	ParquetXZ   = fileparser.ParquetXZ
	ParquetZSTD = fileparser.ParquetZSTD

	// Compressed XLSX variants (gzip, bzip2, xz, zstd).
	XLSXGZ   = fileparser.XLSXGZ
	XLSXBZ2  = fileparser.XLSXBZ2
	XLSXXZ   = fileparser.XLSXXZ
	XLSXZSTD = fileparser.XLSXZSTD

	// zlib-compressed variants, one per base format.
	CSVZLIB     = fileparser.CSVZLIB
	TSVZLIB     = fileparser.TSVZLIB
	LTSVZLIB    = fileparser.LTSVZLIB
	ParquetZLIB = fileparser.ParquetZLIB
	XLSXZLIB    = fileparser.XLSXZLIB

	// Snappy-compressed variants, one per base format.
	CSVSNAPPY     = fileparser.CSVSNAPPY
	TSVSNAPPY     = fileparser.TSVSNAPPY
	LTSVSNAPPY    = fileparser.LTSVSNAPPY
	ParquetSNAPPY = fileparser.ParquetSNAPPY
	XLSXSNAPPY    = fileparser.XLSXSNAPPY

	// S2-compressed variants, one per base format.
	CSVS2     = fileparser.CSVS2
	TSVS2     = fileparser.TSVS2
	LTSVS2    = fileparser.LTSVS2
	ParquetS2 = fileparser.ParquetS2
	XLSXS2    = fileparser.XLSXS2

	// LZ4-compressed variants, one per base format.
	CSVLZ4     = fileparser.CSVLZ4
	TSVLZ4     = fileparser.TSVLZ4
	LTSVLZ4    = fileparser.LTSVLZ4
	ParquetLZ4 = fileparser.ParquetLZ4
	XLSXLZ4    = fileparser.XLSXLZ4
)

Supported file types (re-exported from fileparser)

Variables

View Source
// ErrColumnNotFound is returned when a specified column does not exist
// in the DataFrame. Callers should compare with errors.Is.
var ErrColumnNotFound = errors.New("column not found")

ErrColumnNotFound is returned when a specified column does not exist in the DataFrame.

Functions

This section is empty.

Types

type AggFunc

type AggFunc func(values []any) any

AggFunc is a function type for custom aggregation. It receives a slice of values from the same group and returns the aggregated result.

// AggCount counts the non-nil values in a group. All non-nil values are
// counted, numeric or not. The result is an int64.
var AggCount AggFunc = func(values []any) any {
	var n int64
	for _, v := range values {
		if v == nil {
			continue
		}
		n++
	}
	return n
}

AggCount counts the number of non-nil values. Returns int64 count. Note: this counts all non-nil values, not just numeric ones.

// AggMax finds the maximum numeric value in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored. Returns nil if no
// numeric values exist; otherwise returns float64.
var AggMax AggFunc = func(values []any) any {
	// Seed from the first numeric value instead of a -math.MaxFloat64
	// sentinel, so inputs such as math.Inf(-1) are reported correctly
	// (the sentinel compares greater than -Inf and would mask it).
	var maxVal float64
	found := false
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		if !found || f > maxVal {
			maxVal = f
			found = true
		}
	}
	if !found {
		return nil
	}
	return maxVal
}

AggMax finds the maximum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggMean calculates the arithmetic mean of the numeric values in a group.
// Non-numeric values (including nil, strings, bools) are silently ignored.
// Returns nil if no numeric values exist; otherwise returns float64.
var AggMean AggFunc = func(values []any) any {
	var (
		total float64
		n     int
	)
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		total += f
		n++
	}
	if n == 0 {
		return nil
	}
	return total / float64(n)
}

AggMean calculates the arithmetic mean of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggMin finds the minimum numeric value in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored. Returns nil if no
// numeric values exist; otherwise returns float64.
var AggMin AggFunc = func(values []any) any {
	// Seed from the first numeric value instead of a math.MaxFloat64
	// sentinel, so inputs such as math.Inf(+1) are reported correctly
	// (the sentinel compares less than +Inf and would mask it).
	var minVal float64
	found := false
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		if !found || f < minVal {
			minVal = f
			found = true
		}
	}
	if !found {
		return nil
	}
	return minVal
}

AggMin finds the minimum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.

// AggSum totals the numeric values in a group. Non-numeric values
// (including nil, strings, bools) are silently ignored, so the result is
// 0.0 when no numeric values exist. Always returns float64.
var AggSum AggFunc = func(values []any) any {
	var total float64
	for _, v := range values {
		f, ok := toFloat64(v)
		if !ok {
			continue
		}
		total += f
	}
	return total
}

AggSum calculates the sum of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns 0.0 if all values are non-numeric. Always returns float64.

type DataFrame

type DataFrame struct {
	// contains filtered or unexported fields
}

DataFrame is a simple representation of tabular data. It stores data in row-oriented format with immediate execution (no lazy evaluation).

func ConcatAll

func ConcatAll(frames ...*DataFrame) (*DataFrame, error)

ConcatAll concatenates multiple DataFrames vertically, automatically handling different column sets by taking the union of all columns. This is a standalone function (not a method) that accepts any number of DataFrames.

Column Handling:

  • Columns from all DataFrames are collected into a union set
  • Columns are sorted alphabetically for deterministic output
  • Missing values in rows are set to nil

Nil DataFrames are silently skipped, making this safe for optional data.

Use Cases:

  • Combining data from different sources with overlapping schemas
  • Merging datasets that evolved over time with different columns
  • Appending new data with additional fields to existing data

Example - Combining data with different schemas:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Alice", "age": 30},
})
contacts := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"name": "Bob", "email": "bob@example.com"},
})
result, err := fileframe.ConcatAll(users, contacts)
// Result columns: ["age", "email", "name"] (sorted alphabetically)
// Alice has nil for email, Bob has nil for age

Example - Combining CSV and TSV data:

csv, _ := fileframe.NewDataFrameFromPath("users.csv")
tsv, _ := fileframe.NewDataFrameFromPath("extra_info.tsv")
combined, err := fileframe.ConcatAll(csv, tsv)

func NewDataFrame

func NewDataFrame(reader io.Reader, fileType FileType) (*DataFrame, error)

NewDataFrame creates a DataFrame from an io.Reader. It supports CSV, TSV, LTSV, XLSX, and Parquet formats.

Example:

f, _ := os.Open("data.csv")
defer f.Close()
df, err := fileframe.NewDataFrame(f, fileframe.CSV)

func NewDataFrameFromPath

func NewDataFrameFromPath(path string) (*DataFrame, error)

NewDataFrameFromPath creates a DataFrame from a file path. It automatically detects the file type and handles compressed files (gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4).

Supported formats: CSV, TSV, LTSV, XLSX, Parquet, and their compressed variants. For XLSX files with multiple sheets, the first sheet is used.

Example:

df, err := fileframe.NewDataFrameFromPath("data.csv.gz")
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")

func NewDataFrameFromRecords

func NewDataFrameFromRecords(records []map[string]any) *DataFrame

NewDataFrameFromRecords creates a DataFrame from a slice of maps. Each map represents a row with column names as keys. Column order is determined by processing records in order, and within each record, keys are sorted alphabetically. New columns are appended as they are encountered.

Example:

records := []map[string]any{
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25},
}
df := fileframe.NewDataFrameFromRecords(records)

func (*DataFrame) Columns

func (df *DataFrame) Columns() []string

Columns returns a copy of the column names.

func (*DataFrame) Concat

func (df *DataFrame) Concat(others ...*DataFrame) (*DataFrame, error)

Concat concatenates multiple DataFrames vertically (row-wise). This is useful for combining data from multiple sources with the same schema.

Requirements:

  • All DataFrames must have exactly the same columns in the same order
  • If columns differ, use ConcatAll instead for flexible concatenation

Returns an error if:

  • Any DataFrame is nil
  • Columns don't match (different names or different order)

Example - Combining monthly data:

jan := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Jan", "sales": 100},
})
feb := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Feb", "sales": 150},
})
mar := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"month": "Mar", "sales": 200},
})
quarterly, err := jan.Concat(feb, mar)
// Result: 3 rows with all monthly data

Example - Combining data from multiple CSV files:

df1, _ := fileframe.NewDataFrameFromPath("data_2024_01.csv")
df2, _ := fileframe.NewDataFrameFromPath("data_2024_02.csv")
combined, err := df1.Concat(df2)

func (*DataFrame) Distinct

func (df *DataFrame) Distinct() *DataFrame

Distinct returns a new DataFrame with duplicate rows removed. Two rows are considered duplicates if all their column values are equal.

Example:

unique := df.Distinct()

func (*DataFrame) DistinctBy

func (df *DataFrame) DistinctBy(columns ...string) *DataFrame

DistinctBy returns a new DataFrame with duplicate rows removed based on the specified columns only.

Example:

unique := df.DistinctBy("name", "email")

func (*DataFrame) Drop

func (df *DataFrame) Drop(columns ...string) *DataFrame

Drop returns a new DataFrame with the specified columns removed. Columns that do not exist are silently ignored.

Example:

dropped := df.Drop("temp_col", "debug_col")

func (*DataFrame) DropNA

func (df *DataFrame) DropNA() *DataFrame

DropNA returns a new DataFrame with rows containing nil values removed. By default, removes rows where any column has a nil value.

Example:

cleaned := df.DropNA()

func (*DataFrame) DropNASubset

func (df *DataFrame) DropNASubset(columns ...string) *DataFrame

DropNASubset returns a new DataFrame with rows removed where any of the specified columns have nil values.

Example:

cleaned := df.DropNASubset("required_field1", "required_field2")

func (*DataFrame) FillNA

func (df *DataFrame) FillNA(value any) *DataFrame

FillNA returns a new DataFrame with nil values replaced by the specified value.

Example:

filled := df.FillNA(0)  // Replace all nil with 0

func (*DataFrame) FillNAByColumn

func (df *DataFrame) FillNAByColumn(values map[string]any) *DataFrame

FillNAByColumn returns a new DataFrame with nil values replaced by column-specific values. Columns not in the map retain their nil values.

Example:

filled := df.FillNAByColumn(map[string]any{
    "age":    0,
    "name":   "Unknown",
    "active": false,
})

func (*DataFrame) Filter

func (df *DataFrame) Filter(fn func(row map[string]any) bool) *DataFrame

Filter returns a new DataFrame containing only rows that satisfy the predicate. The predicate function receives a copy of each row to prevent accidental mutation of the original DataFrame.

Example:

filtered := df.Filter(func(row map[string]any) bool {
    age, ok := row["age"].(int64)
    return ok && age >= 18
})

func (*DataFrame) GroupBy

func (df *DataFrame) GroupBy(columns ...string) (*GroupedDataFrame, error)

GroupBy groups the DataFrame by the specified columns. Returns a GroupedDataFrame that can be used with aggregation functions. Returns an error if any of the specified columns do not exist in the DataFrame.

Example:

grouped, err := df.GroupBy("category")
if err != nil {
    log.Fatal(err)
}
result, err := grouped.Sum("amount")

func (*DataFrame) Head

func (df *DataFrame) Head(n int) *DataFrame

Head returns a new DataFrame with the first n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.

Example:

first10 := df.Head(10)

func (*DataFrame) Join

func (df *DataFrame) Join(other *DataFrame, opt JoinOption) (*DataFrame, error)

Join combines two DataFrames based on a common column or column pair. This method enables SQL-like join operations between DataFrames.

Join Types:

  • InnerJoin: Returns only matching rows from both DataFrames
  • LeftJoin: Returns all left rows, with nil for unmatched right columns
  • RightJoin: Returns all right rows, with nil for unmatched left columns
  • OuterJoin: Returns all rows from both, with nil for unmatched columns

Column Handling:

  • The join column from the right DataFrame is excluded from the result
  • Conflicting column names are prefixed with "right_"
  • Result column order: left columns first, then right columns

Limitations:

  • Currently supports joining on a single column pair (1 or 2 columns in On)
  • For complex joins with multiple keys, consider using filesql

Example - Inner Join with same column name:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"id": 1, "product": "Laptop"},
    {"id": 1, "product": "Mouse"},
})
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"id"},
    How: fileframe.InnerJoin,
})
// Result: [{id:1, name:Alice, product:Laptop}, {id:1, name:Alice, product:Mouse}]

Example - Left Join with different column names:

users := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"user_id": 1, "name": "Alice"},
    {"user_id": 2, "name": "Bob"},
    {"user_id": 3, "name": "Charlie"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
    {"customer_id": 1, "product": "Laptop"},
})
result, err := users.Join(orders, fileframe.JoinOption{
    On:  []string{"user_id", "customer_id"},
    How: fileframe.LeftJoin,
})
// Result includes all 3 users; Bob and Charlie have nil for product

func (*DataFrame) Len

func (df *DataFrame) Len() int

Len returns the number of rows in the DataFrame.

func (*DataFrame) Limit

func (df *DataFrame) Limit(n int) *DataFrame

Limit is an alias for Head. Returns a new DataFrame with the first n rows.

Example:

limited := df.Limit(100)

func (*DataFrame) Mutate

func (df *DataFrame) Mutate(column string, fn func(row map[string]any) any) *DataFrame

Mutate returns a new DataFrame with a new or modified column. The function receives a copy of each row and returns the value for the new column. The original DataFrame is not modified.

If the column name is empty or the function is nil, Mutate returns a clone of the original DataFrame without any modifications.

Example:

mutated := df.Mutate("full_name", func(row map[string]any) any {
    first := row["first_name"].(string)
    last := row["last_name"].(string)
    return first + " " + last
})

func (*DataFrame) Rename

func (df *DataFrame) Rename(oldName, newName string) (*DataFrame, error)

Rename returns a new DataFrame with the specified column renamed. Returns an error if the old column does not exist or if the new column name already exists.

Example:

renamed, err := df.Rename("old_name", "new_name")

func (*DataFrame) RenameColumns

func (df *DataFrame) RenameColumns(renames map[string]string) (*DataFrame, error)

RenameColumns returns a new DataFrame with multiple columns renamed. The renames map specifies old name -> new name mappings. Returns an error if any old column does not exist or if any new name conflicts.

Example:

renamed, err := df.RenameColumns(map[string]string{
    "col1": "column_one",
    "col2": "column_two",
})

func (*DataFrame) Select

func (df *DataFrame) Select(columns ...string) *DataFrame

Select returns a new DataFrame with only the specified columns. Columns that do not exist are silently ignored.

Example:

selected := df.Select("name", "age")

func (*DataFrame) Sort

func (df *DataFrame) Sort(column string, order SortOrder) (*DataFrame, error)

Sort returns a new DataFrame sorted by the specified column. Supports sorting by string, int64, and float64 values. Nil values are placed at the end regardless of sort order.

Example:

sorted, err := df.Sort("age", fileframe.Ascending)

func (*DataFrame) SortBy

func (df *DataFrame) SortBy(options ...SortOption) (*DataFrame, error)

SortBy returns a new DataFrame sorted by multiple columns. Columns are sorted in the order specified (first column has highest priority).

Example:

sorted, err := df.SortBy(
    fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
    fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)

func (*DataFrame) Tail

func (df *DataFrame) Tail(n int) *DataFrame

Tail returns a new DataFrame with the last n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.

Example:

last10 := df.Tail(10)

func (*DataFrame) ToCSV

func (df *DataFrame) ToCSV(path string) error

ToCSV writes the DataFrame to a CSV file.

Example:

err := df.ToCSV("output.csv")

func (*DataFrame) ToRecords

func (df *DataFrame) ToRecords() []map[string]any

ToRecords returns the data as a slice of maps. Each map is a copy to ensure immutability.

func (*DataFrame) ToTSV

func (df *DataFrame) ToTSV(path string) error

ToTSV writes the DataFrame to a TSV file.

Example:

err := df.ToTSV("output.tsv")

type FileType

type FileType = fileparser.FileType

FileType represents supported file types including compression variants. This is an alias for fileparser.FileType.

type GroupedDataFrame

type GroupedDataFrame struct {
	// contains filtered or unexported fields
}

GroupedDataFrame represents a DataFrame grouped by one or more columns.

func (*GroupedDataFrame) Agg

func (gdf *GroupedDataFrame) Agg(column string, fn AggFunc) (*DataFrame, error)

Agg performs a custom aggregation on the specified column. The result column is named "agg_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

median, err := grouped.Agg("amount", func(values []any) any {
    sorted := sortValues(values)
    return sorted[len(sorted)/2]
})

func (*GroupedDataFrame) Count

func (gdf *GroupedDataFrame) Count() *DataFrame

Count returns a DataFrame with the count of rows in each group. The result column is named "count".

Example:

grouped, _ := df.GroupBy("category")
counts := grouped.Count()

func (*GroupedDataFrame) Max

func (gdf *GroupedDataFrame) Max(column string) (*DataFrame, error)

Max returns a DataFrame with the maximum value in the specified column for each group. The result column is named "max_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
maximums, err := grouped.Max("amount")

func (*GroupedDataFrame) Mean

func (gdf *GroupedDataFrame) Mean(column string) (*DataFrame, error)

Mean returns a DataFrame with the mean of values in the specified column for each group. The result column is named "mean_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
averages, err := grouped.Mean("amount")

func (*GroupedDataFrame) Min

func (gdf *GroupedDataFrame) Min(column string) (*DataFrame, error)

Min returns a DataFrame with the minimum value in the specified column for each group. The result column is named "min_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
minimums, err := grouped.Min("amount")

func (*GroupedDataFrame) Sum

func (gdf *GroupedDataFrame) Sum(column string) (*DataFrame, error)

Sum returns a DataFrame with the sum of values in the specified column for each group. The result column is named "sum_{column}". Returns an error if the specified column does not exist in the DataFrame.

Example:

grouped, _ := df.GroupBy("category")
totals, err := grouped.Sum("amount")

type JoinOption

type JoinOption struct {
	// On specifies the column(s) to join on.
	// If one column is specified, it is used for both DataFrames.
	// If two columns are specified, the first is for the left DataFrame and the second for the right.
	On []string
	// How specifies the type of join (InnerJoin, LeftJoin, RightJoin, OuterJoin).
	How JoinType
}

JoinOption specifies options for the Join operation.

On field specifies the join column(s):

  • One column: Used for both DataFrames (e.g., On: []string{"id"})
  • Two columns: First for left DataFrame, second for right (e.g., On: []string{"id", "user_id"})

How field specifies the join type (InnerJoin, LeftJoin, RightJoin, OuterJoin).

Example:

// Same column name in both DataFrames
opt := fileframe.JoinOption{On: []string{"id"}, How: fileframe.InnerJoin}

// Different column names
opt := fileframe.JoinOption{On: []string{"id", "user_id"}, How: fileframe.LeftJoin}

type JoinType

type JoinType int

JoinType represents the type of join operation. Four join types are supported: InnerJoin, LeftJoin, RightJoin, and OuterJoin.

const (
	// InnerJoin returns only rows that have matching values in both DataFrames.
	// This is the most restrictive join type - rows without matches are excluded.
	//
	// Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2, 4],
	// an inner join returns only rows for users 1 and 2.
	InnerJoin JoinType = iota

	// LeftJoin returns all rows from the left DataFrame and matched rows from the right DataFrame.
	// For left rows without matches, the right columns will have nil values.
	//
	// Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2],
	// a left join returns all 3 users, with user 3 having nil for order columns.
	LeftJoin

	// RightJoin returns all rows from the right DataFrame and matched rows from the left DataFrame.
	// For right rows without matches, the left columns will have nil values.
	//
	// Example: If users has ids [1, 2] and orders has user_ids [1, 2, 3],
	// a right join returns all 3 orders, with order 3 having nil for user columns.
	RightJoin

	// OuterJoin returns all rows from both DataFrames.
	// Unmatched rows will have nil values for columns from the other DataFrame.
	// This is the most inclusive join type - no rows are excluded.
	//
	// Example: If users has ids [1, 2] and orders has user_ids [2, 3],
	// an outer join returns users 1, 2 and orders 2, 3 (4 rows total).
	OuterJoin
)

type SortOption

type SortOption struct {
	// Column is the column name to sort by.
	Column string
	// Order specifies ascending or descending sort order.
	Order SortOrder
}

SortOption specifies options for the Sort operation.

type SortOrder

type SortOrder int

SortOrder specifies the order for sorting.

const (
	// Ascending sorts values from smallest to largest.
	Ascending SortOrder = iota
	// Descending sorts values from largest to smallest.
	Descending
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL