C5 W4 optimization error on Transformer Network Application: Question Answering

I was trying to run the optimization code on the Hugging Face TFDistilBertForQuestionAnswering model with the bAbI QA dataset. The process gets stuck at the Adam apply_gradients step with the following error:
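
For context, the relevant part of my training step looks roughly like this (a minimal sketch; train_dataset, compute_loss, and the learning rate are placeholders, not the exact notebook code):

import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=5e-5)           # placeholder hyperparameter
losses = []

for step, batch in enumerate(train_dataset):                  # train_dataset: tokenized bAbI QA batches (assumed)
    with tf.GradientTape() as tape:
        outputs = model(batch, training=True)                 # TFDistilBertForQuestionAnswering
        loss = compute_loss(outputs, batch)                   # hypothetical loss helper
    losses.append(loss)
    grads = tape.gradient(loss, model.trainable_weights)
    opt.apply_gradients(zip(grads, model.trainable_weights))  # raises the ValueError below

    if step % 20 == 0:
        print("Training loss (for one batch) at step %d: %.4f" % (step, float(loss)))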

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[19], line 173
    171 losses.append(loss)
    172 grads = tape.gradient(loss, model.trainable_weights)
--> 173 opt.apply_gradients(zip(grads, model.trainable_weights))
    175 if step % 20 == 0:
    176     print("Training loss (for one batch) at step %d: %.4f"% (step, 
    177                                                            float(loss_start)))

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:269, in BaseOptimizer.apply_gradients(self, grads_and_vars)
    267 def apply_gradients(self, grads_and_vars):
    268     grads, trainable_variables = zip(*grads_and_vars)
--> 269     self.apply(grads, trainable_variables)
    270     # Return iterations for compat with tf.keras.
    271     return self.iterations

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:308, in BaseOptimizer.apply(self, grads, trainable_variables)
    306 if not self.built:
    307     with backend.name_scope(self.name, caller=self):
--> 308         self.build(trainable_variables)
    309     self.built = True
    310 self._check_variables_are_known(trainable_variables)

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/adam.py:93, in Adam.build(self, var_list)
     90 self._velocities = []
     91 for var in var_list:
     92     self._momentums.append(
---> 93         self.add_variable_from_reference(
     94             reference_variable=var, name="momentum"
     95         )
     96     )
     97     self._velocities.append(
     98         self.add_variable_from_reference(
     99             reference_variable=var, name="velocity"
    100         )
    101     )
    102 if self.amsgrad:

File /opt/conda/lib/python3.10/site-packages/keras/src/backend/tensorflow/optimizer.py:33, in TFOptimizer.add_variable_from_reference(self, reference_variable, name, initializer)
     28     colocate_var = reference_variable
     30 with self._distribution_strategy.extended.colocate_vars_with(
     31     colocate_var
     32 ):
---> 33     return super().add_variable_from_reference(
     34         reference_variable, name=name, initializer=initializer
     35     )

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:205, in BaseOptimizer.add_variable_from_reference(self, reference_variable, name, initializer)
    203 else:
    204     name = str(reference_variable.name).replace(":", "_") + "_" + name
--> 205 return self.add_variable(
    206     shape=reference_variable.shape,
    207     initializer=initializer,
    208     dtype=reference_variable.dtype,
    209     name=name,
    210 )

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:184, in BaseOptimizer.add_variable(self, shape, initializer, dtype, name)
    182 initializer = initializers.get(initializer)
    183 with backend.name_scope(self.name, caller=self):
--> 184     variable = backend.Variable(
    185         initializer=initializer,
    186         shape=shape,
    187         dtype=dtype,
    188         trainable=False,
    189         name=name,
    190     )
    191 self._track_variable(variable)
    192 return variable

File /opt/conda/lib/python3.10/site-packages/keras/src/backend/common/variables.py:19, in KerasVariable.__init__(self, initializer, shape, dtype, trainable, name)
     17 name = name or auto_name(self.__class__.__name__)
     18 if not isinstance(name, str) or "/" in name:
---> 19     raise ValueError(
     20         "Argument `name` must be a string and "
     21         "cannot contain character `/`. "
     22         f"Received: name={name}"
     23     )
     24 self.name = name
     25 parent_path = current_path()

ValueError: Argument `name` must be a string and cannot contain character `/`. Received: name=tf_distil_bert_for_question_answering/distilbert/embeddings/word_embeddings/weight_0_momentum

I noticed the trainable variable names are formatted like this:

>> model.trainable_weights
<tf.Variable 'tf_distil_bert_for_question_answering/distilbert/embeddings/word_embeddings/weight:0' shape=(30522, 768) dtype=float32, numpy=
 array([[-0.01664949, -0.06661227, -0.01632868, ..., -0.01999032,
         -0.05139988, -0.0263568 ], ...
<tf.Variable 'tf_distil_bert_for_question_answering/distilbert/embeddings/position_embeddings/embeddings:0' shape=(512, 768) dtype=float32, numpy=
 array([[ 1.7505383e-02, -2.5631009e-02, -3.6641564e-02, ...,
<tf.Variable 'tf_distil_bert_for_question_answering/distilbert/embeddings/LayerNorm/gamma:0' shape=(768,) dtype=float32, numpy=
 array([0.84250426, 0.80054885, 0.7313251 , 0.7595404 , 0.79776627,
<tf.Variable 'tf_distil_bert_for_question_answering/distilbert/transformer/layer_._0/attention/q_lin/bias:0' shape=(768,) dtype=float32, numpy=
 array([ 5.41195333e-01, -2.96994209e-01, -4.07260001e-01,  3.45985264e-01,...
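
A quick check (just a sketch to confirm the pattern) shows that every one of these names contains a /, which is exactly the character the Keras variable constructor rejects:

bad_names = [v.name for v in model.trainable_variables if "/" in v.name]
print(len(bad_names), "of", len(model.trainable_variables), "trainable variable names contain '/'")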

Is there a proper way to address this issue?


You probably need the trainable variables here, not the weights!
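
Something along these lines (untested sketch of the two lines I mean):

grads = tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(grads, model.trainable_variables))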

@gent.spah It seems to map to the same set of values and raises the same error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 173
    171 losses.append(loss)
    172 grads = tape.gradient(loss, model.trainable_variables)
--> 173 opt.apply_gradients(zip(grads, model.trainable_variables))
    175 if step % 20 == 0:
    176     print("Training loss (for one batch) at step %d: %.4f"% (step, 
    177                                                            float(loss_start)))

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:269, in BaseOptimizer.apply_gradients(self, grads_and_vars)
    267 def apply_gradients(self, grads_and_vars):
    268     grads, trainable_variables = zip(*grads_and_vars)
--> 269     self.apply(grads, trainable_variables)
    270     # Return iterations for compat with tf.keras.
    271     return self.iterations

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:308, in BaseOptimizer.apply(self, grads, trainable_variables)
    306 if not self.built:
    307     with backend.name_scope(self.name, caller=self):
--> 308         self.build(trainable_variables)
    309     self.built = True
    310 self._check_variables_are_known(trainable_variables)

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/adam.py:93, in Adam.build(self, var_list)
     90 self._velocities = []
     91 for var in var_list:
     92     self._momentums.append(
---> 93         self.add_variable_from_reference(
     94             reference_variable=var, name="momentum"
     95         )
     96     )
     97     self._velocities.append(
     98         self.add_variable_from_reference(
     99             reference_variable=var, name="velocity"
    100         )
    101     )
    102 if self.amsgrad:

File /opt/conda/lib/python3.10/site-packages/keras/src/backend/tensorflow/optimizer.py:33, in TFOptimizer.add_variable_from_reference(self, reference_variable, name, initializer)
     28     colocate_var = reference_variable
     30 with self._distribution_strategy.extended.colocate_vars_with(
     31     colocate_var
     32 ):
---> 33     return super().add_variable_from_reference(
     34         reference_variable, name=name, initializer=initializer
     35     )

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:205, in BaseOptimizer.add_variable_from_reference(self, reference_variable, name, initializer)
    203 else:
    204     name = str(reference_variable.name).replace(":", "_") + "_" + name
--> 205 return self.add_variable(
    206     shape=reference_variable.shape,
    207     initializer=initializer,
    208     dtype=reference_variable.dtype,
    209     name=name,
    210 )

File /opt/conda/lib/python3.10/site-packages/keras/src/optimizers/base_optimizer.py:184, in BaseOptimizer.add_variable(self, shape, initializer, dtype, name)
    182 initializer = initializers.get(initializer)
    183 with backend.name_scope(self.name, caller=self):
--> 184     variable = backend.Variable(
    185         initializer=initializer,
    186         shape=shape,
    187         dtype=dtype,
    188         trainable=False,
    189         name=name,
    190     )
    191 self._track_variable(variable)
    192 return variable

File /opt/conda/lib/python3.10/site-packages/keras/src/backend/common/variables.py:19, in KerasVariable.__init__(self, initializer, shape, dtype, trainable, name)
     17 name = name or auto_name(self.__class__.__name__)
     18 if not isinstance(name, str) or "/" in name:
---> 19     raise ValueError(
     20         "Argument `name` must be a string and "
     21         "cannot contain character `/`. "
     22         f"Received: name={name}"
     23     )
     24 self.name = name
     25 parent_path = current_path()

ValueError: Argument `name` must be a string and cannot contain character `/`. Received: name=tf_distil_bert_for_question_answering_2/distilbert/embeddings/word_embeddings/weight_0_momentum

I am not sure either; I haven't come across this before, but I would suggest checking the Keras optimizer documentation for TensorFlow, and maybe there is a naming format you can use for the variables.
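
One thing that might be worth trying (untested, and just an assumption about the cause) is forcing tf.keras to fall back to the legacy Keras 2 stack, whose optimizers do not reject variable names containing /. That needs the tf-keras package and an environment variable set before TensorFlow is imported:

# Untested sketch: switch tf.keras back to legacy Keras 2 (pip install tf-keras first).
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"   # must be set before importing tensorflow

import tensorflow as tf
opt = tf.keras.optimizers.Adam(learning_rate=5e-5)   # placeholder learning rate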
