Documentation
¶
Overview ¶
Package fileframe provides a lightweight table utility that bridges fileprep and filesql.
fileframe is not a stripped-down copy of Pandas; it is a practical, Go-idiomatic tool for manipulating tabular data. It follows the UNIX philosophy of doing one thing well.
Design Philosophy ¶
- Small: Do one thing well (UNIX philosophy)
- Practical: Only features used in real data analysis
- Simple and clear: API is self-explanatory
- Intuitive: Natural Go-like coding style
- Extensible: Complex features delegated to filesql
Basic Usage ¶
// Create DataFrame from CSV
f, _ := os.Open("sales.csv")
defer f.Close()
df, err := fileframe.NewDataFrame(f, fileframe.CSV)
if err != nil {
log.Fatal(err)
}
// Select columns and filter rows
result := df.
Select("product", "amount", "category").
Filter(func(row map[string]any) bool {
amount, ok := row["amount"].(float64)
return ok && amount > 1000
})
// Group by and aggregate
// (GroupBy and Sum each return an error alongside the result)
grouped, err := result.GroupBy("category")
if err != nil {
log.Fatal(err)
}
summary, err := grouped.Sum("amount")
if err != nil {
log.Fatal(err)
}
// Output to CSV
if err := summary.ToCSV("summary.csv"); err != nil {
log.Fatal(err)
}
Architecture ¶
fileframe sits between fileprep (preprocessing) and filesql (persistence/SQL):
- Receives output from fileprep (io.Reader -> DataFrame)
- Performs basic transformations (Select, Filter, Mutate, GroupBy)
- Outputs to CSV or passes to filesql
For complex operations like Window functions, subqueries, or large-scale data processing, use filesql directly.
Important Notes ¶
- All operations execute immediately (no lazy evaluation)
- Target scale: Small to medium data (under 100,000 rows)
- Row-oriented design with []map[string]any
- All methods return new DataFrames (immutable operations)
Example ¶
Example demonstrates basic DataFrame operations: reading CSV, filtering rows, grouping, and aggregating data.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Sample sales data
csvData := `product,amount,category
Apple,100,Fruit
Banana,150,Fruit
Carrot,80,Vegetable
Orange,120,Fruit
Broccoli,90,Vegetable`
// Create DataFrame from CSV
df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
if err != nil {
fmt.Println("Error:", err)
return
}
fmt.Printf("Total rows: %d\n", df.Len())
fmt.Printf("Columns: %v\n", df.Columns())
// Filter: only items with amount > 100
filtered := df.Filter(func(row map[string]any) bool {
amount, ok := row["amount"].(int64)
return ok && amount > 100
})
fmt.Printf("Rows with amount > 100: %d\n", filtered.Len())
// GroupBy category and sum amounts
groupedDf, err := df.GroupBy("category")
if err != nil {
fmt.Println("Error:", err)
return
}
grouped, err := groupedDf.Sum("amount")
if err != nil {
fmt.Println("Error:", err)
return
}
fmt.Printf("Grouped columns: %v\n", grouped.Columns())
// Show grouped results
for _, row := range grouped.ToRecords() {
fmt.Printf(" %s: %.0f\n", row["category"], row["sum_amount"])
}
}
Output: Total rows: 5 Columns: [product amount category] Rows with amount > 100: 2 Grouped columns: [category sum_amount] Fruit: 370 Vegetable: 170
Example (ComplexOperations) ¶
Example_complexOperations demonstrates advanced DataFrame operations including multiple aggregations, data transformation with Mutate, and combining results from different DataFrames.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Sales data
salesCSV := `order_id,product_id,quantity,unit_price
1,P001,2,100
2,P002,1,200
3,P001,3,100
4,P003,5,50
5,P002,2,200`
// Product master data
productsCSV := `product_id,name,category
P001,Laptop Stand,Electronics
P002,Mechanical Keyboard,Electronics
P003,Notebook,Stationery`
// Create DataFrames
sales, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV) //nolint:errcheck // example code
products, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck // example code
// Add calculated column: total_amount = quantity * unit_price
salesWithTotal := sales.Mutate("total_amount", func(row map[string]any) any {
qty, _ := row["quantity"].(int64) //nolint:errcheck // example code
price, _ := row["unit_price"].(int64) //nolint:errcheck // example code
return qty * price
})
fmt.Println("=== Sales with Total Amount ===")
for _, row := range salesWithTotal.ToRecords() {
fmt.Printf("Order %v: %v x %v = %v\n",
row["order_id"], row["quantity"], row["unit_price"], row["total_amount"])
}
// Aggregate sales by product_id
salesByProductGrp, _ := salesWithTotal.GroupBy("product_id") //nolint:errcheck // example code
salesByProduct, _ := salesByProductGrp.Sum("total_amount") //nolint:errcheck // example code
fmt.Println("\n=== Sales by Product ===")
for _, row := range salesByProduct.ToRecords() {
fmt.Printf("%s: %.0f\n", row["product_id"], row["sum_total_amount"])
}
// Create a lookup map from products DataFrame
productLookup := make(map[string]map[string]any)
for _, row := range products.ToRecords() {
pid, _ := row["product_id"].(string) //nolint:errcheck // example code
productLookup[pid] = row
}
// Combine sales summary with product info (manual join)
combinedRecords := make([]map[string]any, 0)
for _, salesRow := range salesByProduct.ToRecords() {
pid, _ := salesRow["product_id"].(string) //nolint:errcheck // example code
if productInfo, exists := productLookup[pid]; exists {
combined := map[string]any{
"product_id": pid,
"name": productInfo["name"],
"category": productInfo["category"],
"total_sales": salesRow["sum_total_amount"],
}
combinedRecords = append(combinedRecords, combined)
}
}
combined := fileframe.NewDataFrameFromRecords(combinedRecords)
fmt.Println("\n=== Combined Sales Report ===")
fmt.Printf("Columns: %v\n", combined.Columns())
for _, row := range combined.ToRecords() {
fmt.Printf("%s (%s): %.0f\n",
row["name"], row["category"], row["total_sales"])
}
// Group combined data by category
byCategoryGrp, _ := combined.GroupBy("category") //nolint:errcheck // example code
byCategory, _ := byCategoryGrp.Sum("total_sales") //nolint:errcheck // example code
fmt.Println("\n=== Total Sales by Category ===")
for _, row := range byCategory.ToRecords() {
fmt.Printf("%s: %.0f\n", row["category"], row["sum_total_sales"])
}
// Calculate statistics
fmt.Println("\n=== Sales Statistics ===")
statsGrp, _ := salesWithTotal.GroupBy() //nolint:errcheck // example code
stats, _ := statsGrp.Sum("total_amount") //nolint:errcheck // example code
for _, row := range stats.ToRecords() {
fmt.Printf("Total Revenue: %.0f\n", row["sum_total_amount"])
}
meanSalesGrp, _ := salesWithTotal.GroupBy() //nolint:errcheck // example code
meanSales, _ := meanSalesGrp.Mean("total_amount") //nolint:errcheck // example code
for _, row := range meanSales.ToRecords() {
fmt.Printf("Average Order Value: %.0f\n", row["mean_total_amount"])
}
minSalesGrp, _ := salesWithTotal.GroupBy() //nolint:errcheck // example code
minSales, _ := minSalesGrp.Min("total_amount") //nolint:errcheck // example code
for _, row := range minSales.ToRecords() {
fmt.Printf("Min Order: %.0f\n", row["min_total_amount"])
}
maxSalesGrp, _ := salesWithTotal.GroupBy() //nolint:errcheck // example code
maxSales, _ := maxSalesGrp.Max("total_amount") //nolint:errcheck // example code
for _, row := range maxSales.ToRecords() {
fmt.Printf("Max Order: %.0f\n", row["max_total_amount"])
}
}
Output: === Sales with Total Amount === Order 1: 2 x 100 = 200 Order 2: 1 x 200 = 200 Order 3: 3 x 100 = 300 Order 4: 5 x 50 = 250 Order 5: 2 x 200 = 400 === Sales by Product === P001: 500 P002: 600 P003: 250 === Combined Sales Report === Columns: [category name product_id total_sales] Laptop Stand (Electronics): 500 Mechanical Keyboard (Electronics): 600 Notebook (Stationery): 250 === Total Sales by Category === Electronics: 1100 Stationery: 250 === Sales Statistics === Total Revenue: 1350 Average Order Value: 270 Min Order: 200 Max Order: 400
Example (Concat) ¶
Example_concat demonstrates vertical concatenation of DataFrames with the same schema. Use Concat when combining data from multiple sources with identical columns.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Sales data from different regions (same schema)
tokyoCSV := `region,product,sales
Tokyo,Laptop,100
Tokyo,Mouse,300`
osakaCSV := `region,product,sales
Osaka,Laptop,80
Osaka,Mouse,250
Osaka,Keyboard,120`
tokyo, _ := fileframe.NewDataFrame(strings.NewReader(tokyoCSV), fileframe.CSV) //nolint:errcheck
osaka, _ := fileframe.NewDataFrame(strings.NewReader(osakaCSV), fileframe.CSV) //nolint:errcheck
// Concat requires identical columns
combined, err := tokyo.Concat(osaka)
if err != nil {
fmt.Println("Error:", err)
return
}
fmt.Printf("Tokyo rows: %d\n", tokyo.Len())
fmt.Printf("Osaka rows: %d\n", osaka.Len())
fmt.Printf("Combined rows: %d\n", combined.Len())
// Now we can analyze the combined data
grouped, _ := combined.GroupBy("product") //nolint:errcheck
totals, _ := grouped.Sum("sales") //nolint:errcheck
// Sort by product name for deterministic output
sortedTotals, _ := totals.Sort("product", fileframe.Ascending) //nolint:errcheck
fmt.Println("\nTotal sales by product:")
for _, row := range sortedTotals.ToRecords() {
fmt.Printf(" %s: %.0f\n", row["product"], row["sum_sales"])
}
}
Output: Tokyo rows: 2 Osaka rows: 3 Combined rows: 5 Total sales by product: Keyboard: 120 Laptop: 180 Mouse: 550
Example (ConcatAll) ¶
Example_concatAll demonstrates flexible concatenation of DataFrames with different schemas. ConcatAll automatically handles different column sets by creating a union of all columns.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Data from 2023 - basic schema
data2023CSV := `year,product,sales
2023,Laptop,1000
2023,Mouse,500`
// Data from 2024 - added "region" column
data2024CSV := `year,product,sales,region
2024,Laptop,1200,Tokyo
2024,Mouse,600,Osaka
2024,Keyboard,300,Tokyo`
df2023, _ := fileframe.NewDataFrame(strings.NewReader(data2023CSV), fileframe.CSV) //nolint:errcheck
df2024, _ := fileframe.NewDataFrame(strings.NewReader(data2024CSV), fileframe.CSV) //nolint:errcheck
fmt.Printf("2023 columns: %v\n", df2023.Columns())
fmt.Printf("2024 columns: %v\n", df2024.Columns())
// ConcatAll handles different schemas
combined, err := fileframe.ConcatAll(df2023, df2024)
if err != nil {
fmt.Println("Error:", err)
return
}
fmt.Printf("Combined columns: %v\n", combined.Columns())
fmt.Printf("Combined rows: %d\n", combined.Len())
// 2023 data will have nil for "region"
fmt.Println("\nCombined data:")
for _, row := range combined.ToRecords() {
region := row["region"]
if region == nil {
region = "(no region)"
}
// Handle int64 vs float64 for sales column
var sales float64
switch v := row["sales"].(type) {
case int64:
sales = float64(v)
case float64:
sales = v
}
fmt.Printf(" %v %s: %.0f - %v\n",
row["year"], row["product"], sales, region)
}
}
Output: 2023 columns: [year product sales] 2024 columns: [year product sales region] Combined columns: [product region sales year] Combined rows: 5 Combined data: 2023 Laptop: 1000 - (no region) 2023 Mouse: 500 - (no region) 2024 Laptop: 1200 - Tokyo 2024 Mouse: 600 - Osaka 2024 Keyboard: 300 - Tokyo
Example (CustomAggregation) ¶
Example_customAggregation demonstrates how to use the Agg function to implement custom aggregation logic such as median calculation.
package main
import (
"fmt"
"slices"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
csvData := `category,value
A,10
A,20
A,30
A,40
A,50
B,5
B,15
B,25`
df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
if err != nil {
fmt.Println("Error:", err)
return
}
grouped, err := df.GroupBy("category")
if err != nil {
fmt.Println("Error:", err)
return
}
// Custom aggregation: calculate median
median := func(values []any) any {
// Filter and convert to float64
nums := make([]float64, 0, len(values))
for _, v := range values {
switch n := v.(type) {
case int64:
nums = append(nums, float64(n))
case float64:
nums = append(nums, n)
}
}
if len(nums) == 0 {
return nil
}
// Sort values
slices.Sort(nums)
// Calculate median
mid := len(nums) / 2
if len(nums)%2 == 0 {
return (nums[mid-1] + nums[mid]) / 2
}
return nums[mid]
}
result, _ := grouped.Agg("value", median) //nolint:errcheck // example code
fmt.Println("Median by category:")
for _, row := range result.ToRecords() {
fmt.Printf(" %s: %.1f\n", row["category"], row["agg_value"])
}
// Custom aggregation: calculate range (max - min)
rangeFunc := func(values []any) any {
var minVal, maxVal float64
first := true
for _, v := range values {
var n float64
switch val := v.(type) {
case int64:
n = float64(val)
case float64:
n = val
default:
continue
}
if first {
minVal, maxVal = n, n
first = false
} else {
if n < minVal {
minVal = n
}
if n > maxVal {
maxVal = n
}
}
}
if first {
return nil
}
return maxVal - minVal
}
rangeResult, _ := grouped.Agg("value", rangeFunc) //nolint:errcheck // example code
fmt.Println("Range by category:")
for _, row := range rangeResult.ToRecords() {
fmt.Printf(" %s: %.0f\n", row["category"], row["agg_value"])
}
}
Output: Median by category: A: 30.0 B: 15.0 Range by category: A: 40 B: 20
Example (DataframePipeline) ¶
Example_dataframePipeline demonstrates chaining multiple DataFrame operations to build a complete data processing pipeline.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Raw sales data with some issues
salesCSV := `date,region,product,quantity,price,salesperson
2024-01-15,Tokyo,Laptop,2,1000,Alice
2024-01-15,Tokyo,Mouse,10,25,Alice
2024-01-16,Osaka,Laptop,1,1000,Bob
2024-01-16,Osaka,Keyboard,5,75,
2024-01-17,Tokyo,Monitor,3,300,Charlie
2024-01-17,Nagoya,Mouse,8,25,Diana`
df, _ := fileframe.NewDataFrame(strings.NewReader(salesCSV), fileframe.CSV) //nolint:errcheck
// Pipeline: Clean -> Transform -> Aggregate -> Sort -> Limit
result := df.
// 1. Fill missing salesperson
FillNAByColumn(map[string]any{"salesperson": "Unknown"}).
// 2. Add calculated column
Mutate("revenue", func(row map[string]any) any {
qty, _ := row["quantity"].(int64) //nolint:errcheck
price, _ := row["price"].(int64) //nolint:errcheck
return float64(qty) * float64(price)
}).
// 3. Select relevant columns
Select("region", "product", "revenue", "salesperson")
// Group by region and sum revenue
grouped, _ := result.GroupBy("region") //nolint:errcheck
byRegion, _ := grouped.Sum("revenue") //nolint:errcheck
// Sort by revenue descending
sorted, _ := byRegion.Sort("sum_revenue", fileframe.Descending) //nolint:errcheck
fmt.Println("Revenue by Region (Top to Bottom):")
for _, row := range sorted.ToRecords() {
fmt.Printf(" %s: $%.0f\n", row["region"], row["sum_revenue"])
}
// Also get top 3 individual sales
topSales, _ := result.Sort("revenue", fileframe.Descending) //nolint:errcheck
fmt.Println("\nTop 3 Sales:")
for _, row := range topSales.Head(3).ToRecords() {
fmt.Printf(" %s in %s: $%.0f (by %s)\n",
row["product"], row["region"], row["revenue"], row["salesperson"])
}
}
Output: Revenue by Region (Top to Bottom): Tokyo: $3150 Osaka: $1375 Nagoya: $200 Top 3 Sales: Laptop in Tokyo: $2000 (by Alice) Laptop in Osaka: $1000 (by Bob) Monitor in Tokyo: $900 (by Charlie)
Example (DropRename) ¶
Example_dropRename demonstrates column manipulation operations.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
csvData := `user_id,first_name,last_name,internal_code,email
1,Alice,Smith,X123,alice@example.com
2,Bob,Jones,X456,bob@example.com`
df, _ := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV) //nolint:errcheck
fmt.Printf("Original columns: %v\n", df.Columns())
// Drop internal column
cleaned := df.Drop("internal_code")
fmt.Printf("After Drop: %v\n", cleaned.Columns())
// Rename columns for clarity
renamed, _ := cleaned.RenameColumns(map[string]string{ //nolint:errcheck
"first_name": "first",
"last_name": "last",
})
fmt.Printf("After Rename: %v\n", renamed.Columns())
// Single column rename
final, _ := renamed.Rename("user_id", "id") //nolint:errcheck
fmt.Printf("Final columns: %v\n", final.Columns())
}
Output: Original columns: [user_id first_name last_name internal_code email] After Drop: [user_id first_name last_name email] After Rename: [user_id first last email] Final columns: [id first last email]
Example (FileFormats) ¶
Example_fileFormats shows the various file formats supported by fileframe. NewDataFrameFromPath automatically detects file type and handles compression.
Supported formats:
- CSV, TSV, LTSV, XLSX, Parquet
- Compressed variants: .gz, .bz2, .xz, .zst, .z, .snappy, .s2, .lz4
Usage:
// Auto-detect CSV
df, err := fileframe.NewDataFrameFromPath("data.csv")
// Auto-detect compressed CSV (gzip)
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")
// Auto-detect TSV with zstd compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.zst")
// Auto-detect CSV with snappy compression
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")
// Auto-detect TSV with lz4 compression
df, err := fileframe.NewDataFrameFromPath("data.tsv.lz4")
// Auto-detect Excel file
df, err := fileframe.NewDataFrameFromPath("spreadsheet.xlsx")
// Auto-detect Parquet file
df, err := fileframe.NewDataFrameFromPath("data.parquet")
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// This example demonstrates the API for reading various file formats.
// Since Example functions require deterministic output, we show
// equivalent operations using NewDataFrame with explicit file types.
// TSV (Tab-Separated Values)
tsvData := "name\tage\tcity\nAlice\t30\tTokyo\nBob\t25\tOsaka"
dfTSV, _ := fileframe.NewDataFrame(strings.NewReader(tsvData), fileframe.TSV) //nolint:errcheck
fmt.Printf("TSV columns: %v, rows: %d\n", dfTSV.Columns(), dfTSV.Len())
// LTSV (Labeled Tab-Separated Values)
ltsvData := "name:Alice\tage:30\tcity:Tokyo\nname:Bob\tage:25\tcity:Osaka"
dfLTSV, _ := fileframe.NewDataFrame(strings.NewReader(ltsvData), fileframe.LTSV) //nolint:errcheck
fmt.Printf("LTSV columns: %v, rows: %d\n", dfLTSV.Columns(), dfLTSV.Len())
// For file-based operations with compression, use NewDataFrameFromPath:
//
// df, err := fileframe.NewDataFrameFromPath("logs.csv.gz") // gzip
// df, err := fileframe.NewDataFrameFromPath("data.tsv.bz2") // bzip2
// df, err := fileframe.NewDataFrameFromPath("export.csv.xz") // xz
// df, err := fileframe.NewDataFrameFromPath("archive.csv.zst") // zstd
// df, err := fileframe.NewDataFrameFromPath("data.csv.z") // zlib
// df, err := fileframe.NewDataFrameFromPath("data.csv.snappy") // snappy
// df, err := fileframe.NewDataFrameFromPath("data.csv.s2") // s2
// df, err := fileframe.NewDataFrameFromPath("data.csv.lz4") // lz4
}
Output: TSV columns: [name age city], rows: 2 LTSV columns: [name age city], rows: 2
Example (GlobalAggregation) ¶
Example_globalAggregation demonstrates how to calculate global statistics without grouping by any column. Call GroupBy() with no arguments to aggregate the entire DataFrame into a single result.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
csvData := `product,price,quantity
Laptop,1000,5
Mouse,25,50
Keyboard,75,30
Monitor,300,10`
df, err := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV)
if err != nil {
fmt.Println("Error:", err)
return
}
// GroupBy() with no arguments = global aggregation (entire DataFrame as one group)
grouped, err := df.GroupBy()
if err != nil {
fmt.Println("Error:", err)
return
}
// Calculate various statistics for the entire dataset
count := grouped.Count()
fmt.Printf("Total products: %d\n", count.ToRecords()[0]["count"])
sumResult, _ := grouped.Sum("price") //nolint:errcheck // example code
fmt.Printf("Sum of prices: %.0f\n", sumResult.ToRecords()[0]["sum_price"])
meanResult, _ := grouped.Mean("quantity") //nolint:errcheck // example code
fmt.Printf("Average quantity: %.2f\n", meanResult.ToRecords()[0]["mean_quantity"])
minResult, _ := grouped.Min("price") //nolint:errcheck // example code
fmt.Printf("Min price: %.0f\n", minResult.ToRecords()[0]["min_price"])
maxResult, _ := grouped.Max("price") //nolint:errcheck // example code
fmt.Printf("Max price: %.0f\n", maxResult.ToRecords()[0]["max_price"])
}
Output: Total products: 4 Sum of prices: 1400 Average quantity: 23.75 Min price: 25 Max price: 1000
Example (HandleMissingValues) ¶
Example_handleMissingValues demonstrates DropNA and FillNA operations.
package main
import (
"fmt"
"github.com/nao1215/fileframe"
)
func main() {
// Create DataFrame with nil values
records := []map[string]any{
{"name": "Alice", "age": int64(30), "city": "Tokyo"},
{"name": "Bob", "age": nil, "city": "Osaka"},
{"name": nil, "age": int64(25), "city": nil},
{"name": "Diana", "age": int64(35), "city": "Kyoto"},
}
df := fileframe.NewDataFrameFromRecords(records)
fmt.Printf("Original rows: %d\n", df.Len())
// DropNA: Remove rows with any nil values
cleaned := df.DropNA()
fmt.Printf("After DropNA: %d rows\n", cleaned.Len())
// DropNASubset: Remove rows with nil only in specific columns
partialClean := df.DropNASubset("name")
fmt.Printf("After DropNASubset(name): %d rows\n", partialClean.Len())
// FillNA: Replace all nil values with a default
filled := df.FillNA("Unknown")
fmt.Println("\nAfter FillNA('Unknown'):")
for _, row := range filled.ToRecords() {
fmt.Printf(" %v, %v, %v\n", row["name"], row["age"], row["city"])
}
// FillNAByColumn: Different defaults per column
smartFilled := df.FillNAByColumn(map[string]any{
"name": "Anonymous",
"age": int64(0),
"city": "Unknown",
})
fmt.Println("\nAfter FillNAByColumn:")
for _, row := range smartFilled.ToRecords() {
fmt.Printf(" %v, %v, %v\n", row["name"], row["age"], row["city"])
}
}
Output: Original rows: 4 After DropNA: 2 rows After DropNASubset(name): 3 rows After FillNA('Unknown'): Alice, 30, Tokyo Bob, Unknown, Osaka Unknown, 25, Unknown Diana, 35, Kyoto After FillNAByColumn: Alice, 30, Tokyo Bob, 0, Osaka Anonymous, 25, Unknown Diana, 35, Kyoto
Example (HeadTailLimit) ¶
Example_headTailLimit demonstrates row selection operations.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
csvData := `id,value
1,100
2,200
3,300
4,400
5,500
6,600
7,700`
df, _ := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV) //nolint:errcheck
fmt.Printf("Total rows: %d\n", df.Len())
// Get first 3 rows
head := df.Head(3)
fmt.Printf("\nHead(3) - first 3 rows:\n")
for _, row := range head.ToRecords() {
fmt.Printf(" id=%v, value=%v\n", row["id"], row["value"])
}
// Get last 2 rows
tail := df.Tail(2)
fmt.Printf("\nTail(2) - last 2 rows:\n")
for _, row := range tail.ToRecords() {
fmt.Printf(" id=%v, value=%v\n", row["id"], row["value"])
}
// Limit is alias for Head - useful for SQL-like syntax
limited := df.Limit(2)
fmt.Printf("\nLimit(2) rows: %d\n", limited.Len())
}
Output: Total rows: 7 Head(3) - first 3 rows: id=1, value=100 id=2, value=200 id=3, value=300 Tail(2) - last 2 rows: id=6, value=600 id=7, value=700 Limit(2) rows: 2
Example (Join) ¶
Example_join demonstrates how to combine two DataFrames using Join. This is similar to SQL JOIN operations and supports inner, left, right, and outer joins.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Users table
usersCSV := `id,name,department
1,Alice,Engineering
2,Bob,Marketing
3,Charlie,Engineering
4,Diana,Sales`
// Orders table - note that user_id references users.id
ordersCSV := `order_id,user_id,product,amount
101,1,Laptop,1200
102,1,Mouse,50
103,2,Monitor,400
104,5,Keyboard,100`
users, _ := fileframe.NewDataFrame(strings.NewReader(usersCSV), fileframe.CSV) //nolint:errcheck
orders, _ := fileframe.NewDataFrame(strings.NewReader(ordersCSV), fileframe.CSV) //nolint:errcheck
// Inner Join: Only users who have orders
inner, _ := users.Join(orders, fileframe.JoinOption{ //nolint:errcheck
On: []string{"id", "user_id"}, // Left column, Right column
How: fileframe.InnerJoin,
})
fmt.Println("=== Inner Join (users with orders) ===")
fmt.Printf("Rows: %d\n", inner.Len())
for _, row := range inner.ToRecords() {
fmt.Printf(" %s ordered %s ($%v)\n", row["name"], row["product"], row["amount"])
}
// Left Join: All users, with order info if available
left, _ := users.Join(orders, fileframe.JoinOption{ //nolint:errcheck
On: []string{"id", "user_id"},
How: fileframe.LeftJoin,
})
fmt.Println("\n=== Left Join (all users) ===")
fmt.Printf("Rows: %d\n", left.Len())
// Count users without orders
noOrders := 0
for _, row := range left.ToRecords() {
if row["order_id"] == nil {
noOrders++
}
}
fmt.Printf("Users without orders: %d\n", noOrders)
}
Output: === Inner Join (users with orders) === Rows: 3 Alice ordered Laptop ($1200) Alice ordered Mouse ($50) Bob ordered Monitor ($400) === Left Join (all users) === Rows: 5 Users without orders: 2
Example (JoinTypes) ¶
Example_joinTypes demonstrates all four join types: InnerJoin, LeftJoin, RightJoin, and OuterJoin.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
// Left DataFrame: Products
productsCSV := `product_id,name
P1,Laptop
P2,Mouse
P3,Keyboard`
// Right DataFrame: Inventory
inventoryCSV := `item_id,quantity,warehouse
P1,50,Tokyo
P2,200,Osaka
P4,30,Tokyo`
products, _ := fileframe.NewDataFrame(strings.NewReader(productsCSV), fileframe.CSV) //nolint:errcheck
inventory, _ := fileframe.NewDataFrame(strings.NewReader(inventoryCSV), fileframe.CSV) //nolint:errcheck
// Inner Join: Only products in inventory
inner, _ := products.Join(inventory, fileframe.JoinOption{ //nolint:errcheck
On: []string{"product_id", "item_id"},
How: fileframe.InnerJoin,
})
fmt.Printf("Inner Join: %d rows (products in inventory)\n", inner.Len())
// Left Join: All products, with inventory if exists
left, _ := products.Join(inventory, fileframe.JoinOption{ //nolint:errcheck
On: []string{"product_id", "item_id"},
How: fileframe.LeftJoin,
})
fmt.Printf("Left Join: %d rows (all products)\n", left.Len())
// Right Join: All inventory items, with product info if exists
right, _ := products.Join(inventory, fileframe.JoinOption{ //nolint:errcheck
On: []string{"product_id", "item_id"},
How: fileframe.RightJoin,
})
fmt.Printf("Right Join: %d rows (all inventory)\n", right.Len())
// Outer Join: Everything from both
outer, _ := products.Join(inventory, fileframe.JoinOption{ //nolint:errcheck
On: []string{"product_id", "item_id"},
How: fileframe.OuterJoin,
})
fmt.Printf("Outer Join: %d rows (all products + all inventory)\n", outer.Len())
}
Output: Inner Join: 2 rows (products in inventory) Left Join: 3 rows (all products) Right Join: 3 rows (all inventory) Outer Join: 4 rows (all products + all inventory)
Example (SortAndDistinct) ¶
Example_sortAndDistinct demonstrates sorting and deduplication operations.
package main
import (
"fmt"
"strings"
"github.com/nao1215/fileframe"
)
func main() {
csvData := `name,category,score
Alice,A,85
Bob,B,90
Charlie,A,85
Alice,A,85
Diana,B,75
Eve,A,95`
df, _ := fileframe.NewDataFrame(strings.NewReader(csvData), fileframe.CSV) //nolint:errcheck
fmt.Printf("Original rows: %d\n", df.Len())
// Remove duplicate rows
unique := df.Distinct()
fmt.Printf("After Distinct: %d rows\n", unique.Len())
// Sort by score descending
sorted, _ := unique.Sort("score", fileframe.Descending) //nolint:errcheck
fmt.Println("\nTop scores:")
for _, row := range sorted.Head(3).ToRecords() {
fmt.Printf(" %s: %v\n", row["name"], row["score"])
}
// Sort by multiple columns: category ascending, then score descending
multiSorted, _ := unique.SortBy( //nolint:errcheck
fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
fileframe.SortOption{Column: "score", Order: fileframe.Descending},
)
fmt.Println("\nBy category, then score:")
for _, row := range multiSorted.ToRecords() {
fmt.Printf(" [%s] %s: %v\n", row["category"], row["name"], row["score"])
}
}
Output: Original rows: 6 After Distinct: 5 rows Top scores: Eve: 95 Bob: 90 Alice: 85 By category, then score: [A] Eve: 95 [A] Alice: 85 [A] Charlie: 85 [B] Bob: 90 [B] Diana: 75
Index ¶
- Constants
- Variables
- type AggFunc
- type DataFrame
- func (df *DataFrame) Columns() []string
- func (df *DataFrame) Concat(others ...*DataFrame) (*DataFrame, error)
- func (df *DataFrame) Distinct() *DataFrame
- func (df *DataFrame) DistinctBy(columns ...string) *DataFrame
- func (df *DataFrame) Drop(columns ...string) *DataFrame
- func (df *DataFrame) DropNA() *DataFrame
- func (df *DataFrame) DropNASubset(columns ...string) *DataFrame
- func (df *DataFrame) FillNA(value any) *DataFrame
- func (df *DataFrame) FillNAByColumn(values map[string]any) *DataFrame
- func (df *DataFrame) Filter(fn func(row map[string]any) bool) *DataFrame
- func (df *DataFrame) GroupBy(columns ...string) (*GroupedDataFrame, error)
- func (df *DataFrame) Head(n int) *DataFrame
- func (df *DataFrame) Join(other *DataFrame, opt JoinOption) (*DataFrame, error)
- func (df *DataFrame) Len() int
- func (df *DataFrame) Limit(n int) *DataFrame
- func (df *DataFrame) Mutate(column string, fn func(row map[string]any) any) *DataFrame
- func (df *DataFrame) Rename(oldName, newName string) (*DataFrame, error)
- func (df *DataFrame) RenameColumns(renames map[string]string) (*DataFrame, error)
- func (df *DataFrame) Select(columns ...string) *DataFrame
- func (df *DataFrame) Sort(column string, order SortOrder) (*DataFrame, error)
- func (df *DataFrame) SortBy(options ...SortOption) (*DataFrame, error)
- func (df *DataFrame) Tail(n int) *DataFrame
- func (df *DataFrame) ToCSV(path string) error
- func (df *DataFrame) ToRecords() []map[string]any
- func (df *DataFrame) ToTSV(path string) error
- type FileType
- type GroupedDataFrame
- func (gdf *GroupedDataFrame) Agg(column string, fn AggFunc) (*DataFrame, error)
- func (gdf *GroupedDataFrame) Count() *DataFrame
- func (gdf *GroupedDataFrame) Max(column string) (*DataFrame, error)
- func (gdf *GroupedDataFrame) Mean(column string) (*DataFrame, error)
- func (gdf *GroupedDataFrame) Min(column string) (*DataFrame, error)
- func (gdf *GroupedDataFrame) Sum(column string) (*DataFrame, error)
- type JoinOption
- type JoinType
- type SortOption
- type SortOrder
Examples ¶
- Package
- Package (ComplexOperations)
- Package (Concat)
- Package (ConcatAll)
- Package (CustomAggregation)
- Package (DataframePipeline)
- Package (DropRename)
- Package (FileFormats)
- Package (GlobalAggregation)
- Package (HandleMissingValues)
- Package (HeadTailLimit)
- Package (Join)
- Package (JoinTypes)
- Package (SortAndDistinct)
Constants ¶
const ( // CSV represents CSV file type CSV = fileparser.CSV // TSV represents TSV file type TSV = fileparser.TSV // LTSV represents LTSV file type LTSV = fileparser.LTSV // Parquet represents Parquet file type Parquet = fileparser.Parquet // XLSX represents Excel XLSX file type XLSX = fileparser.XLSX // Compressed CSV variants CSVGZ = fileparser.CSVGZ CSVBZ2 = fileparser.CSVBZ2 CSVXZ = fileparser.CSVXZ CSVZSTD = fileparser.CSVZSTD // Compressed TSV variants TSVGZ = fileparser.TSVGZ TSVBZ2 = fileparser.TSVBZ2 TSVXZ = fileparser.TSVXZ TSVZSTD = fileparser.TSVZSTD // Compressed LTSV variants LTSVGZ = fileparser.LTSVGZ LTSVBZ2 = fileparser.LTSVBZ2 LTSVXZ = fileparser.LTSVXZ LTSVZSTD = fileparser.LTSVZSTD // Compressed Parquet variants ParquetGZ = fileparser.ParquetGZ ParquetBZ2 = fileparser.ParquetBZ2 ParquetXZ = fileparser.ParquetXZ ParquetZSTD = fileparser.ParquetZSTD // Compressed XLSX variants XLSXGZ = fileparser.XLSXGZ XLSXBZ2 = fileparser.XLSXBZ2 XLSXXZ = fileparser.XLSXXZ XLSXZSTD = fileparser.XLSXZSTD // ZLIB compressed variants CSVZLIB = fileparser.CSVZLIB TSVZLIB = fileparser.TSVZLIB LTSVZLIB = fileparser.LTSVZLIB ParquetZLIB = fileparser.ParquetZLIB XLSXZLIB = fileparser.XLSXZLIB // Snappy compressed variants CSVSNAPPY = fileparser.CSVSNAPPY TSVSNAPPY = fileparser.TSVSNAPPY LTSVSNAPPY = fileparser.LTSVSNAPPY ParquetSNAPPY = fileparser.ParquetSNAPPY XLSXSNAPPY = fileparser.XLSXSNAPPY // S2 compressed variants CSVS2 = fileparser.CSVS2 TSVS2 = fileparser.TSVS2 LTSVS2 = fileparser.LTSVS2 ParquetS2 = fileparser.ParquetS2 XLSXS2 = fileparser.XLSXS2 // LZ4 compressed variants CSVLZ4 = fileparser.CSVLZ4 TSVLZ4 = fileparser.TSVLZ4 LTSVLZ4 = fileparser.LTSVLZ4 ParquetLZ4 = fileparser.ParquetLZ4 XLSXLZ4 = fileparser.XLSXLZ4 )
Supported file types (re-exported from fileparser)
Variables ¶
var ErrColumnNotFound = errors.New("column not found")
ErrColumnNotFound is returned when a specified column does not exist in the DataFrame.
Functions ¶
This section is empty.
Types ¶
type AggFunc ¶
AggFunc is a function type for custom aggregation. It receives a slice of values from the same group and returns the aggregated result.
var AggCount AggFunc = func(values []any) any { count := 0 for _, v := range values { if v != nil { count++ } } return int64(count) }
AggCount counts the number of non-nil values. Returns int64 count. Note: this counts all non-nil values, not just numeric ones.
var AggMax AggFunc = func(values []any) any { maxVal := -math.MaxFloat64 found := false for _, v := range values { if f, ok := toFloat64(v); ok { if f > maxVal { maxVal = f } found = true } } if !found { return nil } return maxVal }
AggMax finds the maximum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.
var AggMean AggFunc = func(values []any) any { sum := 0.0 count := 0 for _, v := range values { if f, ok := toFloat64(v); ok { sum += f count++ } } if count == 0 { return nil } return sum / float64(count) }
AggMean calculates the arithmetic mean of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.
var AggMin AggFunc = func(values []any) any { minVal := math.MaxFloat64 found := false for _, v := range values { if f, ok := toFloat64(v); ok { if f < minVal { minVal = f } found = true } } if !found { return nil } return minVal }
AggMin finds the minimum numeric value. Non-numeric values (including nil, strings, bools) are silently ignored. Returns nil if no numeric values exist. Otherwise returns float64.
var AggSum AggFunc = func(values []any) any {
sum := 0.0
for _, v := range values {
if f, ok := toFloat64(v); ok {
sum += f
}
}
return sum
}
AggSum calculates the sum of numeric values. Non-numeric values (including nil, strings, bools) are silently ignored. Returns 0.0 if all values are non-numeric. Always returns float64.
type DataFrame ¶
type DataFrame struct {
// contains filtered or unexported fields
}
DataFrame is a simple representation of tabular data. It stores data in row-oriented format with immediate execution (no lazy evaluation).
func ConcatAll ¶
ConcatAll concatenates multiple DataFrames vertically, automatically handling different column sets by taking the union of all columns. This is a standalone function (not a method) that accepts any number of DataFrames.
Column Handling:
- Columns from all DataFrames are collected into a union set
- Columns are sorted alphabetically for deterministic output
- Missing values in rows are set to nil
Nil DataFrames are silently skipped, making this safe for optional data.
Use Cases:
- Combining data from different sources with overlapping schemas
- Merging datasets that evolved over time with different columns
- Appending new data with additional fields to existing data
Example - Combining data with different schemas:
users := fileframe.NewDataFrameFromRecords([]map[string]any{
{"name": "Alice", "age": 30},
})
contacts := fileframe.NewDataFrameFromRecords([]map[string]any{
{"name": "Bob", "email": "bob@example.com"},
})
result, err := fileframe.ConcatAll(users, contacts)
// Result columns: ["age", "email", "name"] (sorted alphabetically)
// Alice has nil for email, Bob has nil for age
Example - Combining CSV and TSV data:
csv, _ := fileframe.NewDataFrameFromPath("users.csv")
tsv, _ := fileframe.NewDataFrameFromPath("extra_info.tsv")
combined, err := fileframe.ConcatAll(csv, tsv)
func NewDataFrame ¶
NewDataFrame creates a DataFrame from an io.Reader. It supports CSV, TSV, LTSV, XLSX, and Parquet formats.
Example:
f, _ := os.Open("data.csv")
defer f.Close()
df, err := fileframe.NewDataFrame(f, fileframe.CSV)
func NewDataFrameFromPath ¶
NewDataFrameFromPath creates a DataFrame from a file path. It automatically detects the file type and handles compressed files (gzip, bzip2, xz, zstd, zlib, snappy, s2, lz4).
Supported formats: CSV, TSV, LTSV, XLSX, Parquet, and their compressed variants. For XLSX files with multiple sheets, the first sheet is used.
Example:
df, err := fileframe.NewDataFrameFromPath("data.csv.gz")
df, err := fileframe.NewDataFrameFromPath("data.csv.snappy")
func NewDataFrameFromRecords ¶
NewDataFrameFromRecords creates a DataFrame from a slice of maps. Each map represents a row with column names as keys. Column order is determined by processing records in order, and within each record, keys are sorted alphabetically. New columns are appended as they are encountered.
Example:
records := []map[string]any{
{"name": "Alice", "age": 30},
{"name": "Bob", "age": 25},
}
df := fileframe.NewDataFrameFromRecords(records)
func (*DataFrame) Concat ¶
Concat concatenates multiple DataFrames vertically (row-wise). This is useful for combining data from multiple sources with the same schema.
Requirements:
- All DataFrames must have exactly the same columns in the same order
- If columns differ, use ConcatAll instead for flexible concatenation
Returns an error if:
- Any DataFrame is nil
- Columns don't match (different names or different order)
Example - Combining monthly data:
jan := fileframe.NewDataFrameFromRecords([]map[string]any{
{"month": "Jan", "sales": 100},
})
feb := fileframe.NewDataFrameFromRecords([]map[string]any{
{"month": "Feb", "sales": 150},
})
mar := fileframe.NewDataFrameFromRecords([]map[string]any{
{"month": "Mar", "sales": 200},
})
quarterly, err := jan.Concat(feb, mar)
// Result: 3 rows with all monthly data
Example - Combining data from multiple CSV files:
df1, _ := fileframe.NewDataFrameFromPath("data_2024_01.csv")
df2, _ := fileframe.NewDataFrameFromPath("data_2024_02.csv")
combined, err := df1.Concat(df2)
func (*DataFrame) Distinct ¶
Distinct returns a new DataFrame with duplicate rows removed. Two rows are considered duplicates if all their column values are equal.
Example:
unique := df.Distinct()
func (*DataFrame) DistinctBy ¶
DistinctBy returns a new DataFrame with duplicate rows removed based on the specified columns only.
Example:
unique := df.DistinctBy("name", "email")
func (*DataFrame) Drop ¶
Drop returns a new DataFrame with the specified columns removed. Columns that do not exist are silently ignored.
Example:
dropped := df.Drop("temp_col", "debug_col")
func (*DataFrame) DropNA ¶
DropNA returns a new DataFrame with rows containing nil values removed. By default, removes rows where any column has a nil value.
Example:
cleaned := df.DropNA()
func (*DataFrame) DropNASubset ¶
DropNASubset returns a new DataFrame with rows removed where any of the specified columns have nil values.
Example:
cleaned := df.DropNASubset("required_field1", "required_field2")
func (*DataFrame) FillNA ¶
FillNA returns a new DataFrame with nil values replaced by the specified value.
Example:
filled := df.FillNA(0) // Replace all nil with 0
func (*DataFrame) FillNAByColumn ¶
FillNAByColumn returns a new DataFrame with nil values replaced by column-specific values. Columns not in the map retain their nil values.
Example:
filled := df.FillNAByColumn(map[string]any{
"age": 0,
"name": "Unknown",
"active": false,
})
func (*DataFrame) Filter ¶
Filter returns a new DataFrame containing only rows that satisfy the predicate. The predicate function receives a copy of each row to prevent accidental mutation of the original DataFrame.
Example:
filtered := df.Filter(func(row map[string]any) bool {
age, ok := row["age"].(int64)
return ok && age >= 18
})
func (*DataFrame) GroupBy ¶
func (df *DataFrame) GroupBy(columns ...string) (*GroupedDataFrame, error)
GroupBy groups the DataFrame by the specified columns. Returns a GroupedDataFrame that can be used with aggregation functions. Returns an error if any of the specified columns do not exist in the DataFrame.
Example:
grouped, err := df.GroupBy("category")
if err != nil {
log.Fatal(err)
}
result, err := grouped.Sum("amount")
func (*DataFrame) Head ¶
Head returns a new DataFrame with the first n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.
Example:
first10 := df.Head(10)
func (*DataFrame) Join ¶
func (df *DataFrame) Join(other *DataFrame, opt JoinOption) (*DataFrame, error)
Join combines two DataFrames based on a common column or column pair. This method enables SQL-like join operations between DataFrames.
Join Types:
- InnerJoin: Returns only matching rows from both DataFrames
- LeftJoin: Returns all left rows, with nil for unmatched right columns
- RightJoin: Returns all right rows, with nil for unmatched left columns
- OuterJoin: Returns all rows from both, with nil for unmatched columns
Column Handling:
- The join column from the right DataFrame is excluded from the result
- Conflicting column names are prefixed with "right_"
- Result column order: left columns first, then right columns
Limitations:
- Currently supports joining on a single column pair (1 or 2 columns in On)
- For complex joins with multiple keys, consider using filesql
Example - Inner Join with same column name:
users := fileframe.NewDataFrameFromRecords([]map[string]any{
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
{"id": 1, "product": "Laptop"},
{"id": 1, "product": "Mouse"},
})
result, err := users.Join(orders, fileframe.JoinOption{
On: []string{"id"},
How: fileframe.InnerJoin,
})
// Result: [{id:1, name:Alice, product:Laptop}, {id:1, name:Alice, product:Mouse}]
Example - Left Join with different column names:
users := fileframe.NewDataFrameFromRecords([]map[string]any{
{"user_id": 1, "name": "Alice"},
{"user_id": 2, "name": "Bob"},
{"user_id": 3, "name": "Charlie"},
})
orders := fileframe.NewDataFrameFromRecords([]map[string]any{
{"customer_id": 1, "product": "Laptop"},
})
result, err := users.Join(orders, fileframe.JoinOption{
On: []string{"user_id", "customer_id"},
How: fileframe.LeftJoin,
})
// Result includes all 3 users; Bob and Charlie have nil for product
func (*DataFrame) Limit ¶
Limit is an alias for Head. Returns a new DataFrame with the first n rows.
Example:
limited := df.Limit(100)
func (*DataFrame) Mutate ¶
Mutate returns a new DataFrame with a new or modified column. The function receives a copy of each row and returns the value for the new column. The original DataFrame is not modified.
If the column name is empty or the function is nil, Mutate returns a clone of the original DataFrame without any modifications.
Example:
mutated := df.Mutate("full_name", func(row map[string]any) any {
first := row["first_name"].(string)
last := row["last_name"].(string)
return first + " " + last
})
func (*DataFrame) Rename ¶
Rename returns a new DataFrame with the specified column renamed. Returns an error if the old column does not exist or if the new column name already exists.
Example:
renamed, err := df.Rename("old_name", "new_name")
func (*DataFrame) RenameColumns ¶
RenameColumns returns a new DataFrame with multiple columns renamed. The renames map specifies old name -> new name mappings. Returns an error if any old column does not exist or if any new name conflicts.
Example:
renamed, err := df.RenameColumns(map[string]string{
"col1": "column_one",
"col2": "column_two",
})
func (*DataFrame) Select ¶
Select returns a new DataFrame with only the specified columns. Columns that do not exist are silently ignored.
Example:
selected := df.Select("name", "age")
func (*DataFrame) Sort ¶
Sort returns a new DataFrame sorted by the specified column. Supports sorting by string, int64, and float64 values. Nil values are placed at the end regardless of sort order.
Example:
sorted, err := df.Sort("age", fileframe.Ascending)
func (*DataFrame) SortBy ¶
func (df *DataFrame) SortBy(options ...SortOption) (*DataFrame, error)
SortBy returns a new DataFrame sorted by multiple columns. Columns are sorted in the order specified (first column has highest priority).
Example:
sorted, err := df.SortBy(
fileframe.SortOption{Column: "category", Order: fileframe.Ascending},
fileframe.SortOption{Column: "price", Order: fileframe.Descending},
)
func (*DataFrame) Tail ¶
Tail returns a new DataFrame with the last n rows. If n is greater than the number of rows, all rows are returned. If n is negative, returns an empty DataFrame.
Example:
last10 := df.Tail(10)
func (*DataFrame) ToCSV ¶
ToCSV writes the DataFrame to a CSV file.
Example:
err := df.ToCSV("output.csv")
type FileType ¶
type FileType = fileparser.FileType
FileType represents supported file types including compression variants. This is an alias for fileparser.FileType.
type GroupedDataFrame ¶
type GroupedDataFrame struct {
// contains filtered or unexported fields
}
GroupedDataFrame represents a DataFrame grouped by one or more columns.
func (*GroupedDataFrame) Agg ¶
func (gdf *GroupedDataFrame) Agg(column string, fn AggFunc) (*DataFrame, error)
Agg performs a custom aggregation on the specified column. The result column is named "agg_{column}". Returns an error if the specified column does not exist in the DataFrame.
Example:
median, err := grouped.Agg("amount", func(values []any) any {
sorted := sortValues(values)
return sorted[len(sorted)/2]
})
func (*GroupedDataFrame) Count ¶
func (gdf *GroupedDataFrame) Count() *DataFrame
Count returns a DataFrame with the count of rows in each group. The result column is named "count".
Example:
grouped, _ := df.GroupBy("category")
counts := grouped.Count()
func (*GroupedDataFrame) Max ¶
func (gdf *GroupedDataFrame) Max(column string) (*DataFrame, error)
Max returns a DataFrame with the maximum value in the specified column for each group. The result column is named "max_{column}". Returns an error if the specified column does not exist in the DataFrame.
Example:
grouped, _ := df.GroupBy("category")
maximums, err := grouped.Max("amount")
func (*GroupedDataFrame) Mean ¶
func (gdf *GroupedDataFrame) Mean(column string) (*DataFrame, error)
Mean returns a DataFrame with the mean of values in the specified column for each group. The result column is named "mean_{column}". Returns an error if the specified column does not exist in the DataFrame.
Example:
grouped, _ := df.GroupBy("category")
averages, err := grouped.Mean("amount")
func (*GroupedDataFrame) Min ¶
func (gdf *GroupedDataFrame) Min(column string) (*DataFrame, error)
Min returns a DataFrame with the minimum value in the specified column for each group. The result column is named "min_{column}". Returns an error if the specified column does not exist in the DataFrame.
Example:
grouped, _ := df.GroupBy("category")
minimums, err := grouped.Min("amount")
func (*GroupedDataFrame) Sum ¶
func (gdf *GroupedDataFrame) Sum(column string) (*DataFrame, error)
Sum returns a DataFrame with the sum of values in the specified column for each group. The result column is named "sum_{column}". Returns an error if the specified column does not exist in the DataFrame.
Example:
grouped, _ := df.GroupBy("category")
totals, err := grouped.Sum("amount")
type JoinOption ¶
type JoinOption struct {
// On specifies the column(s) to join on.
// If one column is specified, it is used for both DataFrames.
// If two columns are specified, the first is for the left DataFrame and the second for the right.
On []string
// How specifies the type of join (InnerJoin, LeftJoin, RightJoin, OuterJoin).
How JoinType
}
JoinOption specifies options for the Join operation.
On field specifies the join column(s):
- One column: Used for both DataFrames (e.g., On: []string{"id"})
- Two columns: First for left DataFrame, second for right (e.g., On: []string{"id", "user_id"})
How field specifies the join type (InnerJoin, LeftJoin, RightJoin, OuterJoin).
Example:
// Same column name in both DataFrames
opt := fileframe.JoinOption{On: []string{"id"}, How: fileframe.InnerJoin}
// Different column names
opt := fileframe.JoinOption{On: []string{"id", "user_id"}, How: fileframe.LeftJoin}
type JoinType ¶
type JoinType int
JoinType represents the type of join operation. Four join types are supported: InnerJoin, LeftJoin, RightJoin, and OuterJoin.
const ( // InnerJoin returns only rows that have matching values in both DataFrames. // This is the most restrictive join type - rows without matches are excluded. // // Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2, 4], // an inner join returns only rows for users 1 and 2. InnerJoin JoinType = iota // LeftJoin returns all rows from the left DataFrame and matched rows from the right DataFrame. // For left rows without matches, the right columns will have nil values. // // Example: If users has ids [1, 2, 3] and orders has user_ids [1, 2], // a left join returns all 3 users, with user 3 having nil for order columns. LeftJoin // RightJoin returns all rows from the right DataFrame and matched rows from the left DataFrame. // For right rows without matches, the left columns will have nil values. // // Example: If users has ids [1, 2] and orders has user_ids [1, 2, 3], // a right join returns all 3 orders, with order 3 having nil for user columns. RightJoin // OuterJoin returns all rows from both DataFrames. // Unmatched rows will have nil values for columns from the other DataFrame. // This is the most inclusive join type - no rows are excluded. // // Example: If users has ids [1, 2] and orders has user_ids [2, 3], // an outer join returns users 1, 2 and orders 2, 3 (4 rows total). OuterJoin )
type SortOption ¶
type SortOption struct {
// Column is the column name to sort by.
Column string
// Order specifies ascending or descending sort order.
Order SortOrder
}
SortOption specifies options for the Sort operation.
