
I am trying to do a forward-fill imputation. Below is the code for that.

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

df = spark.createDataFrame([(1, 1, None), (1, 2, 5), (1, 3, None), (1, 4, None), (1, 5, 10), (1, 6, None)], ("session", "timestamp", "id"))

PRV_RANK = 0.0
def fun(rank):
    ######## How to check if None or NaN? ########
    if rank is None or rank is NaN:
        return PRV_RANK
    else:
        PRV_RANK = rank
        return rank

fuN = F.udf(fun, IntegerType())

df.withColumn("ffill_new", fuN(df["id"])).show()

I am getting a weird error in the log.

Edit: The question is about how to identify null and NaN values in a Spark DataFrame using Python.

Edit: I am assuming the line of code that checks for NaN and null is causing the issue, so I have titled the question accordingly.
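For reference, the check that the commented line in the code asks about is normally written in plain Python with the None test first, since math.isnan rejects None. A minimal sketch (the is_missing name is mine, not part of the original code):

import math

def is_missing(value):
    # None must be tested before math.isnan, which only accepts numbers;
    # "value is NaN" never works: there is no built-in NaN name, and NaN
    # floats are not a singleton like None.
    return value is None or (isinstance(value, float) and math.isnan(value))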

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
    df_na.withColumn("ffill_new", forwardFill(df_na["id"])).show()
  File "C:\Spark\python\pyspark\sql\dataframe.py", line 318, in show
    print(self._jdf.showString(n, 20))
  File "C:\Spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "C:\Spark\python\pyspark\sql\utils.py", line 63, in deco
    return f(*a, **kw)
  File "C:\Spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o806.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 47.0 failed 1 times, most recent failure: Lost task 0.0 in stage 47.0 (TID 83, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 174, in main
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 169, in process
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 106, in <lambda>
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 92, in <lambda>
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 70, in <lambda>
  File "<stdin>", line 5, in forwardfil
UnboundLocalError: local variable 'PRV_RANK' referenced before assignment

  at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
  at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
  at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
  at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
  at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.scheduler.Task.run(Task.scala:99)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
  at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
  at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:333)
  at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
  at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
  at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2128)
  at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2127)
  at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2818)
  at org.apache.spark.sql.Dataset.head(Dataset.scala:2127)
  at org.apache.spark.sql.Dataset.take(Dataset.scala:2342)
  at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
  at sun.reflect.GeneratedMethodAccessor35.invoke(Unknown Source)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:498)
  at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
  at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
  at py4j.Gateway.invoke(Gateway.java:280)
  at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
  at py4j.commands.CallCommand.execute(CallCommand.java:79)
  at py4j.GatewayConnection.run(GatewayConnection.java:214)
  at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 174, in main
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 169, in process
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 106, in <lambda>
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 92, in <lambda>
  File "C:\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 70, in <lambda>
  File "<stdin>", line 5, in forwardfil
UnboundLocalError: local variable 'PRV_RANK' referenced before assignment
  at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
  at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
  at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
  at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
  at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.scheduler.Task.run(Task.scala:99)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
  ... 1 more
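The UnboundLocalError at the end of the trace is a plain Python scoping problem rather than a Spark one: because the function assigns to PRV_RANK, Python treats PRV_RANK as local everywhere in the function body, so reading it before the assignment fails. A minimal sketch of the scoping fix, keeping the names from the question (note that even with this fix, global state inside a UDF is not a reliable forward fill, because Spark applies the UDF per row with no ordering guarantee across partitions):

PRV_RANK = 0.0

def fun(rank):
    global PRV_RANK  # without this, the assignment below makes PRV_RANK local
    if rank is None:  # NaN would still need a separate math.isnan check
        return PRV_RANK
    PRV_RANK = rank
    return rank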

4 Comments
  • Is that the end of the stack trace? Commented Jun 12, 2017 at 8:09
  • Possible duplicate of How to check for NaN in python? Commented Jun 12, 2017 at 8:16
  • Your fuN spelling is wrong. Commented Jun 12, 2017 at 8:25
  • Your question is still misleading; please review it. Commented Jun 12, 2017 at 11:29

1 Answer

df.withColumn("ffill_new", f.UserDefinedFunction(lambda x: x or 0, IntegerType())(df["id"])).show()

1 Comment

While this code may answer the question, providing additional context regarding how and/or why it solves the problem would improve the answer's long-term value.
