@@ -111,15 +111,18 @@ def set_defaults(validator, properties, instance, schema):
111111 if not isinstance (data , dict ):
112112 try :
113113 import pandas as pd
114- import pandas as pl
114+ import polars as pl
115115
116116 records = []
117117 if isinstance (data , pd .DataFrame ):
118+ logger .debug ("Validating pandas DataFrame" )
118119 records = data .to_dict ("records" )
119120 elif isinstance (data , pl .DataFrame ):
121+ logger .debug ("Validating polars DataFrame" )
120122 records = data .iter_rows (named = True )
121123 elif isinstance (data , pl .LazyFrame ):
122124 # A LazyFrame is probably backed by a large dataset, so only its first 1000 records are checked
125+ logger .debug ("Validating first 1000 rows of polars LazyFrame" )
123126 records = data .head (1000 ).collect ().iter_rows (named = True )
124127 else :
125128 raise WorkflowError ("Unsupported data type for validation." )
@@ -136,18 +139,39 @@ def set_defaults(validator, properties, instance, schema):
136139 jsonschema .validate (record , schema , resolver = resolver )
137140 except jsonschema .exceptions .ValidationError as e :
138141 raise WorkflowError (f"Error validating row { i } of data frame." , e )
142+
139143 if set_default :
140- newdata = pd .DataFrame (recordlist , data .index )
141- newcol = ~ newdata .columns .isin (data .columns )
142- n = len (data .columns )
143- for col in newdata .loc [:, newcol ].columns :
144- data .insert (n , col , newdata .loc [:, col ])
145- n = n + 1
144+ if isinstance (data , pd .DataFrame ):
145+ newdata = pd .DataFrame (recordlist , data .index )
146+ # Add missing columns
147+ newcol = newdata .columns [~ newdata .columns .isin (data .columns )]
148+ data [newcol ] = None
149+ # Fill in None values with values from newdata
150+ data .update (newdata )
151+ elif isinstance (data , pl .DataFrame ):
152+ newdata = pl .DataFrame (recordlist )
153+ # Add missing columns
154+ newcol = [col for col in newdata .columns if col not in data .columns ]
155+ [
156+ data .insert_column (
157+ len (data .columns ),
158+ pl .lit (None , newdata [col ].dtype ).alias (col ),
159+ )
160+ for col in newcol
161+ ]
162+ # Fill in None values with values from newdata
163+ for i in range (data .shape [0 ]):
164+ for j in range (data .shape [1 ]):
165+ if data [i , j ] == None :
166+ data [i , j ] = newdata [i , j ]
167+ elif isinstance (data , pl .LazyFrame ):
168+ logger .warning ("LazyFrame does not support setting default values." )
146169 return
147170 except ImportError :
148171 pass
149172 raise WorkflowError ("Error validating data frame." )
150173 else :
174+ logger .debug ("Validating dict" )
151175 try :
152176 if set_default :
153177 DefaultValidator (schema , resolver = resolver ).validate (data )
0 commit comments