Skip to content

Commit bfc2fe2

Browse files
committed
Re-implement setting of default values
1 parent 58287f0 commit bfc2fe2

5 files changed

Lines changed: 92 additions & 18 deletions

File tree

snakemake/utils.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,15 +111,18 @@ def set_defaults(validator, properties, instance, schema):
111111
if not isinstance(data, dict):
112112
try:
113113
import pandas as pd
114-
import pandas as pl
114+
import polars as pl
115115

116116
records = []
117117
if isinstance(data, pd.DataFrame):
118+
logger.debug("Validating pandas DataFrame")
118119
records = data.to_dict("records")
119120
elif isinstance(data, pl.DataFrame):
121+
logger.debug("Validating polars DataFrame")
120122
records = data.iter_rows(named=True)
121123
elif isinstance(data, pl.LazyFrame):
122124
# A LazyFrame is probably a large dataframe, so only the first 1000 records are checked
125+
logger.debug("Validating first 1000 rows of polars LazyFrame")
123126
records = data.head(1000).collect().iter_rows(named=True)
124127
else:
125128
raise WorkflowError("Unsupported data type for validation.")
@@ -136,18 +139,39 @@ def set_defaults(validator, properties, instance, schema):
136139
jsonschema.validate(record, schema, resolver=resolver)
137140
except jsonschema.exceptions.ValidationError as e:
138141
raise WorkflowError(f"Error validating row {i} of data frame.", e)
142+
139143
if set_default:
140-
newdata = pd.DataFrame(recordlist, data.index)
141-
newcol = ~newdata.columns.isin(data.columns)
142-
n = len(data.columns)
143-
for col in newdata.loc[:, newcol].columns:
144-
data.insert(n, col, newdata.loc[:, col])
145-
n = n + 1
144+
if isinstance(data, pd.DataFrame):
145+
newdata = pd.DataFrame(recordlist, data.index)
146+
# Add missing columns
147+
newcol = newdata.columns[~newdata.columns.isin(data.columns)]
148+
data[newcol] = None
149+
# Fill in None values with values from newdata
150+
data.update(newdata)
151+
elif isinstance(data, pl.DataFrame):
152+
newdata = pl.DataFrame(recordlist)
153+
# Add missing columns
154+
newcol = [col for col in newdata.columns if col not in data.columns]
155+
[
156+
data.insert_column(
157+
len(data.columns),
158+
pl.lit(None, newdata[col].dtype).alias(col),
159+
)
160+
for col in newcol
161+
]
162+
# Fill in None values with values from newdata
163+
for i in range(data.shape[0]):
164+
for j in range(data.shape[1]):
165+
if data[i, j] == None:
166+
data[i, j] = newdata[i, j]
167+
elif isinstance(data, pl.LazyFrame):
168+
logger.warning("LazyFrame does not support setting default values.")
146169
return
147170
except ImportError:
148171
pass
149172
raise WorkflowError("Error validating data frame.")
150173
else:
174+
logger.debug("Validating dict")
151175
try:
152176
if set_default:
153177
DefaultValidator(schema, resolver=resolver).validate(data)

tests/test_validate/Snakefile

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,68 @@ import pandas as pd
44
import polars as pl
55
from snakemake.utils import validate
66

7+
78
configfile: "config.yaml"
9+
10+
811
validate(config, "config.schema.yaml")
912

1013

11-
samples = pd.read_table(config["samples"]).set_index("sample", drop=False)
14+
# Polars DataFrame
15+
samples = pl.read_csv(
16+
config["samples"],
17+
separator="\t",
18+
schema={"sample": pl.String, "condition": pl.String, "n": pl.UInt8},
19+
null_values="NA",
20+
)
21+
validate(samples, "samples.schema.yaml")
22+
assert samples[0, "tissue"] == "blood"
23+
assert samples[0, "n"] == 1
24+
assert samples[1, "n"] == 0
25+
26+
# Polars LazyFrame
27+
samples = pl.scan_csv(
28+
config["samples"],
29+
separator="\t",
30+
schema={"sample": pl.String, "condition": pl.String, "n": pl.UInt8},
31+
null_values="NA",
32+
)
33+
validate(samples, "samples.schema.yaml", set_default=False)
34+
assert samples.collect()[0, "n"] == 1
35+
36+
# Pandas DataFrame without index
37+
samples = pd.read_table(config["samples"])
1238
validate(samples, "samples.schema.yaml")
39+
assert samples.iloc[0]["tissue"] == "blood"
40+
assert samples.iloc[0]["n"] == 1
41+
assert samples.iloc[1]["n"] == 0
1342

14-
samples = pl.read_csv(config["samples"], separator="\t")
43+
# Dict
44+
df = pd.read_table(config["samples"])
45+
samples = df.iloc[0].to_dict()
46+
validate(samples, "samples.schema.yaml")
47+
assert samples["tissue"] == "blood"
48+
assert samples["n"] == 1
49+
samples = {k: v for k, v in df.iloc[1].to_dict().items() if pd.notnull(v)}
1550
validate(samples, "samples.schema.yaml")
51+
assert samples["tissue"] == "blood"
52+
assert samples["n"] == 0
1653

17-
samples = pl.scan_csv(config["samples"], separator="\t")
54+
# Pandas DataFrame with index
55+
samples = pd.read_table(config["samples"]).set_index("sample", drop=False)
1856
validate(samples, "samples.schema.yaml")
57+
assert samples.iloc[0]["tissue"] == "blood"
58+
assert samples.iloc[0]["n"] == 1
59+
assert samples.iloc[1]["n"] == 0
60+
1961

2062
rule all:
2163
input:
22-
expand("test.{sample}.txt", sample=samples.index)
64+
expand("test.{sample}.txt", sample=samples.index),
2365

2466

2567
rule a:
2668
output:
27-
"test.{sample}.txt"
69+
"test.{sample}.txt",
2870
shell:
2971
"touch {output}"

tests/test_validate/config.schema.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
$schema: "https://json-schema.org/draft/2020-12/schema#"
1+
$schema: "https://json-schema.org/draft/2020-12/schema"
22

33
description: snakemake configuration file
44

tests/test_validate/samples.schema.yaml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
1-
$schema: "https://json-schema.org/draft/2020-12/schema#"
1+
$schema: "https://json-schema.org/draft/2020-12/schema"
22
description: an entry in the sample sheet
33
properties:
44
sample:
55
type: string
66
description: sample name/identifier
77
condition:
88
type: string
9-
description: sample condition that will be compared during differential expression analysis (e.g. a treatment, a tissue time, a disease)
9+
description: sample condition
10+
n:
11+
type: integer
12+
default: 0
13+
description: replicate count
14+
tissue:
15+
type: string
16+
default: blood
17+
description: sample tissue of origin
1018

1119
required:
1220
- sample

tests/test_validate/samples.tsv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
sample condition
2-
A tumor
3-
B blood
1+
sample condition n
2+
A case 1
3+
B control NA

0 commit comments

Comments
 (0)