Stuck with an error in graph mode

I am trying to fit an entire training loop including the epochs into graph mode and I keep getting this error:

in user code:

    File "C:\Users\naray\AppData\Roaming\JetBrains\PyCharm2022.3\scratches\scratch_10.py", line 69, in train  *
        for epoch in tf.range(0, epochs, dtype=tf.int8):

    TypeError: Failed to convert elements of SparseCategoricalAccuracy(name=sparse_categorical_accuracy,dtype=float32) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.

This is my own exercise, so I am pasting the code here:

import os

import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np

import time

start = time.time()
os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '1'
batch_size = tf.constant(64)
epochs = tf.constant(10)

train_data, info = tfds.load('fashion_mnist', split='train', with_info=True)
test_data = tfds.load('fashion_mnist', split='test')


def prepare_data(data):
    image = data['image']
    image = tf.reshape(image, (-1,))
    image = tf.cast(image, tf.float32)
    image = image / 255.0
    return image, data['label']


train_data = train_data.map(prepare_data)
train_data = train_data.shuffle(buffer_size=1024).batch(64)

test_data = test_data.map(prepare_data)
test_data = test_data.shuffle(buffer_size=1024).batch(64)


def basic_model():
    inputs = tf.keras.layers.Input(shape=(784,))
    x = tf.keras.layers.Dense(64, activation='relu')(inputs)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    outputs = tf.keras.layers.Dense(10, activation='softmax')(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)


model = basic_model()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()
train_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
train_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
val_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)


def validate(test_data, model, val_losses, val_acc_object, loss_object):
    print('validation trace')
    for x, ytrue in test_data:
        ypred = model(x)
        loss_value = loss_object(ytrue, ypred)
        val_losses.write(val_losses.size() - 1, loss_value).mark_used()
        val_acc_object.update_state(ytrue, ypred)
    return val_losses, val_acc_object


@tf.function
def train(train_data, test_data, model, loss_object, optimizer,
          train_acc_object, val_acc_object, epochs,
          train_losses,
          val_losses):
    print('train trace', tf.range(0, epochs, dtype=tf.int8))
    for epoch in tf.range(0, epochs, dtype=tf.int8):
        tf.print('epoch: ', epoch)
        for x, ytrue in train_data:
            with tf.GradientTape() as tape:
                ypred = model(x)
                loss_value = loss_object(ytrue, ypred)
            grads = tape.gradient(loss_value, model.variables)
            train_losses.write(train_losses.size() - 1, loss_value).mark_used()
            # tf.print(tf.convert_to_tensor(grads).shape, ' ', tf.convert_to_tensor(model.variables).shape)
            optimizer.apply_gradients(zip(grads, model.variables))
            train_acc_object.update_state(ytrue, ypred)

        val_losses, val_acc_object = validate(test_data, model, val_losses, val_acc_object, loss_object)
        tf.print('train_loss: ', tf.math.reduce_mean(train_losses.stack()), ' train_acc: ', train_acc_object.result(),
                 ' val_loss: ', tf.math.reduce_mean(val_losses.stack()), ' val_accuracy: ', val_acc_object.result())
        train_acc_object.reset_state()
        val_acc_object.reset_state()

try:
    train(train_data=train_data, test_data=test_data, model=model, loss_object=loss_object, optimizer=optimizer, train_acc_object=train_acc_object, val_acc_object=val_acc_object, epochs=epochs, train_losses=train_losses, val_losses=val_losses)
except TypeError as e:
    print(tf.autograph.to_code(train.python_function))
    print(e)
print('time: ', time.time() - start)

Here is the graph generator:

def tf__train(train_data, test_data, model, loss_object, optimizer, train_acc_object, val_acc_object, epochs, train_losses, val_losses):
    with ag__.FunctionScope('train', 'fscope', ag__.ConversionOptions(recursive=True, user_requested=True, optional_features=(), internal_convert_user_code=True)) as fscope:
        ag__.ld(print)('train trace', ag__.converted_call(ag__.ld(tf).range, (0, ag__.ld(epochs)), dict(dtype=ag__.ld(tf).int8), fscope))

        def get_state_1():
            return (val_acc_object, val_losses)

        def set_state_1(vars_):
            nonlocal val_acc_object, val_losses
            (val_acc_object, val_losses) = vars_

        def loop_body_1(itr_1):
            nonlocal val_acc_object, val_losses
            epoch = itr_1
            ag__.converted_call(ag__.ld(tf).print, ('epoch: ', ag__.ld(epoch)), None, fscope)

            def get_state():
                return ()

            def set_state(block_vars):
                pass

            def loop_body(itr):
                (x, ytrue) = itr
                with ag__.ld(tf).GradientTape() as tape:
                    ypred = ag__.converted_call(ag__.ld(model), (ag__.ld(x),), None, fscope)
                    loss_value = ag__.converted_call(ag__.ld(loss_object), (ag__.ld(ytrue), ag__.ld(ypred)), None, fscope)
                grads = ag__.converted_call(ag__.ld(tape).gradient, (ag__.ld(loss_value), ag__.ld(model).variables), None, fscope)
                ag__.converted_call(ag__.converted_call(ag__.ld(train_losses).write, (ag__.converted_call(ag__.ld(train_losses).size, (), None, fscope) - 1, ag__.ld(loss_value)), None, fscope).mark_used, (), None, fscope)
                ag__.converted_call(ag__.ld(optimizer).apply_gradients, (ag__.converted_call(ag__.ld(zip), (ag__.ld(grads), ag__.ld(model).variables), None, fscope),), None, fscope)
                ag__.converted_call(ag__.ld(train_acc_object).update_state, (ag__.ld(ytrue), ag__.ld(ypred)), None, fscope)
            ag__.for_stmt(ag__.ld(train_data), None, loop_body, get_state, set_state, (), {'iterate_names': '(x, ytrue)'})
            (val_losses, val_acc_object) = ag__.converted_call(ag__.ld(validate), (ag__.ld(test_data), ag__.ld(model), ag__.ld(val_losses), ag__.ld(val_acc_object), ag__.ld(loss_object)), None, fscope)
            ag__.converted_call(ag__.ld(tf).print, ('train_loss: ', ag__.converted_call(ag__.ld(tf).math.reduce_mean, (ag__.converted_call(ag__.ld(train_losses).stack, (), None, fscope),), None, fscope), ' train_acc: ', ag__.converted_call(ag__.ld(train_acc_object).result, (), None, fscope), ' val_loss: ', ag__.converted_call(ag__.ld(tf).math.reduce_mean, (ag__.converted_call(ag__.ld(val_losses).stack, (), None, fscope),), None, fscope), ' val_accuracy: ', ag__.converted_call(ag__.ld(val_acc_object).result, (), None, fscope)), None, fscope)
        ypred = ag__.Undefined('ypred')
        tape = ag__.Undefined('tape')
        ytrue = ag__.Undefined('ytrue')
        grads = ag__.Undefined('grads')
        x = ag__.Undefined('x')
        epoch = ag__.Undefined('epoch')
        loss_value = ag__.Undefined('loss_value')
        ag__.for_stmt(ag__.converted_call(ag__.ld(tf).range, (0, ag__.ld(epochs)), dict(dtype=ag__.ld(tf).int8), fscope), None, loop_body_1, get_state_1, set_state_1, ('val_acc_object', 'val_losses'), {'iterate_names': 'epoch'})

The error occurs at the epoch loop but says it is about SparseCategoricalCrossentropy, which is what I don’t understand. Please help me with this.

What would happen if you use range() instead if tf.range()?

Converting to python range still gives the same error. I’m assuming the problem is that the objects of SparseCategoricalAccuracy metric are not being received by the graph. Is there a way to convert these to normal tensors or pass it in a way that a graph can accept it?

Can you try categoricalcrossentropy instead, if the labels are one hot encoded thats what you need!

I was able to run it without errors in eager mode, so I don’t think the problem is related to the loss object. It has to be with the graph mode.
If I try to return something like a metric object or a model from the decorated function, I get this error. Otherwise, I am able to run it in graph mode without including the loop and by returning only tensors from the decorated function.

Here is my program without including loops in graph mode:

import os

import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np

import time

start = time.time()
os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '1'
batch_size = tf.constant(64)
epochs = tf.constant(10)

train_data, info = tfds.load('fashion_mnist', split='train', with_info=True)
test_data = tfds.load('fashion_mnist', split='test')


def prepare_data(data):
    image = data['image']
    image = tf.reshape(image, (-1,))
    image = tf.cast(image, tf.float32)
    image = image / 255.0
    return image, data['label']


train_data = train_data.map(prepare_data)
train_data = train_data.shuffle(buffer_size=1024).batch(64).prefetch(1)

test_data = test_data.map(prepare_data)
test_data = test_data.shuffle(buffer_size=1024).batch(64).prefetch(1)


def basic_model():
    inputs = tf.keras.layers.Input(shape=(784,))
    x = tf.keras.layers.Dense(64, activation='relu')(inputs)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    outputs = tf.keras.layers.Dense(10, activation='softmax')(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)


model = basic_model()

# loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
# optimizer = tf.keras.optimizers.Adam()
# train_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
# val_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
# train_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
# val_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
loss_value = tf.Variable(0)

@tf.function
def train_step( x, ytrue, model, loss_object, optimizer):
    print('train_step trace')
    with tf.GradientTape() as tape:
        ypred = model(x)
        loss_value = loss_object(ytrue, ypred)
    grads = tape.gradient(loss_value, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables))
    return loss_value, ypred


@tf.function
def validation_step(x, ytrue, model, loss_object):
    print('validation_step trace')
    ypred = model(x)
    loss_value = loss_object(ytrue, ypred)
    return loss_value, ypred


def train(model=model, train_data=train_data, test_data=test_data):
    print('train trace')
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam()
    train_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
    val_acc_object = tf.keras.metrics.SparseCategoricalAccuracy()
    train_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
    val_losses = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
    for epoch in tf.range(epochs, dtype=tf.int32):
        print('epoch ',epoch)
        for i, (x, ytrue) in train_data.enumerate():
            loss_value, ypred = train_step(x, ytrue, model, loss_object, optimizer)
        train_losses.write(i, loss_value).mark_used()
        train_acc_object.update_state(ytrue, ypred)

        for i, (x, ytrue) in test_data.enumerate():
            loss_value, ypred = validation_step(x, ytrue, model, loss_object)
        val_losses.write(i, loss_value).mark_used()
        val_acc_object.update_state(ytrue, ypred)

        print('train_error: ', tf.math.reduce_mean(train_losses.stack()), ' train_acc: ',train_acc_object.result(),
              ' val_error: ', tf.math.reduce_mean(val_losses.stack()), ' val_acc: ',val_acc_object.result())


try:
    train()
except TypeError as e:
    print(tf.autograph.to_code(train_step.python_function))
    print(tf.autograph.to_code(validation_step.python_function))
    print(e)

print('time: ',time.time() - start)

Here is my output in graph mode without including loops:

train trace
epoch  tf.Tensor(0, shape=(), dtype=int32)
train_step trace
train_step trace
train_step trace
validation_step trace
validation_step trace
train_error:  tf.Tensor(0.00022745856, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.96875, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0037840025, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8125, shape=(), dtype=float32)
epoch  tf.Tensor(1, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00012775861, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.953125, shape=(), dtype=float32)  val_error:  tf.Tensor(0.00090734364, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.875, shape=(), dtype=float32)
epoch  tf.Tensor(2, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00030771855, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.9270833, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0017645856, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8541667, shape=(), dtype=float32)
epoch  tf.Tensor(3, shape=(), dtype=int32)
train_error:  tf.Tensor(7.789134e-05, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.9453125, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0062182425, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8125, shape=(), dtype=float32)
epoch  tf.Tensor(4, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00033706325, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.9375, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0020649936, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.825, shape=(), dtype=float32)
epoch  tf.Tensor(5, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00033246292, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.9270833, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0013289611, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8333333, shape=(), dtype=float32)
epoch  tf.Tensor(6, shape=(), dtype=int32)
train_error:  tf.Tensor(0.0001625357, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.92410713, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0009809914, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.84821427, shape=(), dtype=float32)
epoch  tf.Tensor(7, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00033120092, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.91796875, shape=(), dtype=float32)  val_error:  tf.Tensor(0.0024140156, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8515625, shape=(), dtype=float32)
epoch  tf.Tensor(8, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00021466451, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.9131944, shape=(), dtype=float32)  val_error:  tf.Tensor(0.002004218, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8472222, shape=(), dtype=float32)
epoch  tf.Tensor(9, shape=(), dtype=int32)
train_error:  tf.Tensor(0.00012499285, shape=(), dtype=float32)  train_acc:  tf.Tensor(0.915625, shape=(), dtype=float32)  val_error:  tf.Tensor(0.00043017257, shape=(), dtype=float32)  val_acc:  tf.Tensor(0.8625, shape=(), dtype=float32)
time:  27.27366614341736

Process finished with exit code 0

I do not have any other thoughts for it to be frank with you!