Getting ValueError: Dimensions must be equal, but are 128 and 1024 when fine-tuning BART for summarization

I am trying to fine-tune BART on my custom dataset for summarization, using the TensorFlow version of bart-large-cnn. I tokenized the data with the tokenizer of the same checkpoint, with max_input_length = 1024 and max_target_length = 128.
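The preprocessing looks roughly like this (a simplified sketch: the checkpoint id and the `text`/`summary` column names stand in for my actual dataset fields):

```python
from datasets import Dataset
from transformers import AutoTokenizer

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 1024   # encoder (source document) length
max_target_length = 128   # decoder (summary) length

def preprocess(examples):
    # Tokenize the source documents, truncating to 1024 tokens
    model_inputs = tokenizer(
        examples["text"], max_length=max_input_length, truncation=True
    )
    # BART uses the same tokenizer for the target summaries, truncated to 128 tokens
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

raw_ds = Dataset.from_dict(
    {"text": ["a long source document ..."], "summary": ["a short summary ..."]}
)
tokenized_ds = raw_ds.map(preprocess, batched=True)
```

When I call model.fit on the tf.data datasets built from these tokenized splits, I get the following error: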

ValueError                                Traceback (most recent call last)
<ipython-input-13-bea8e09e9af3> in <module>
     11                                         push_to_hub=True,
     12                                         huggingface_hub_token=huggingface_hub_token,
---> 13                                         hub_name="bart-large_cnn-ft_scitldr",
     14                                         #run_opts=run_opts
     15                                        )

<ipython-input-8-2aad6d2e6f0d> in finetune_and_eval_model(model_checkpoint, tokenizer_name, tf_tokenized_train_ds, tf_tokenized_valid_ds, tf_tokenized_test_ds, dataset_name, Config, strategy, tf_data_size_info, run_opts, wandb_project_name, wandb_run_name, huggingface_hub_token, push_to_hub, hub_name, take_datasets)
    145                       validation_data=tf_tokenized_valid_ds,
    146                       epochs=1,
--> 147                       callbacks=[WandbCallback()]
    148                      )
    149 

/opt/conda/lib/python3.7/site-packages/wandb/integration/keras/keras.py in new_v2(*args, **kwargs)
    122             for cbk in cbks:
    123                 set_wandb_attrs(cbk, val_data)
--> 124         return old_v2(*args, **kwargs)
    125 
    126     training_arrays.orig_fit_loop = old_arrays

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1098                 _r=1):
   1099               callbacks.on_train_batch_begin(step)
-> 1100               tmp_logs = self.train_function(iterator)
   1101               if data_handler.should_sync:
   1102                 context.async_wait()

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    826     tracing_count = self.experimental_get_tracing_count()
    827     with trace.Trace(self._name) as tm:
--> 828       result = self._call(*args, **kwds)
    829       compiler = "xla" if self._experimental_compile else "nonXla"
    830       new_tracing_count = self.experimental_get_tracing_count()

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    869       # This is the first call of __call__, so we have to initialize.
    870       initializers = []
--> 871       self._initialize(args, kwds, add_initializers_to=initializers)
    872     finally:
    873       # At this point we know that the initialization is complete (or less

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    724     self._concrete_stateful_fn = (
    725         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 726             *args, **kwds))
    727 
    728     def invalid_creator_scope(*unused_args, **unused_kwds):

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2967       args, kwargs = None, None
   2968     with self._lock:
-> 2969       graph_function, _ = self._maybe_define_function(args, kwargs)
   2970     return graph_function
   2971 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   3359 
   3360           self._function_cache.missed.add(call_context_key)
-> 3361           graph_function = self._create_graph_function(args, kwargs)
   3362           self._function_cache.primary[cache_key] = graph_function
   3363 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   3204             arg_names=arg_names,
   3205             override_flat_arg_shapes=override_flat_arg_shapes,
-> 3206             capture_by_value=self._capture_by_value),
   3207         self._function_attributes,
   3208         function_spec=self.function_spec,

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    988         _, original_func = tf_decorator.unwrap(python_func)
    989 
--> 990       func_outputs = python_func(*func_args, **func_kwargs)
    991 
    992       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    632             xla_context.Exit()
    633         else:
--> 634           out = weak_wrapped_fn().__wrapped__(*args, **kwds)
    635         return out
    636 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    975           except Exception as e:  # pylint:disable=broad-except
    976             if hasattr(e, "ag_error_metadata"):
--> 977               raise e.ag_error_metadata.to_exception(e)
    978             else:
    979               raise

ValueError: in user code:

    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /opt/conda/lib/python3.7/site-packages/transformers/models/bart/modeling_tf_bart.py:1392 call  *
        outputs = self.model(
    /opt/conda/lib/python3.7/site-packages/transformers/models/bart/modeling_tf_bart.py:1145 call  *
        decoder_outputs = self.decoder(
    /opt/conda/lib/python3.7/site-packages/transformers/models/bart/modeling_tf_bart.py:984 call  *
        hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer(
    /opt/conda/lib/python3.7/site-packages/transformers/models/bart/modeling_tf_bart.py:409 call  *
        hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
    /opt/conda/lib/python3.7/site-packages/transformers/models/bart/modeling_tf_bart.py:236 call  *
        attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:1180 binary_op_wrapper
        raise e
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:1164 binary_op_wrapper
        return func(x, y, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:1486 _add_dispatch
        return gen_math_ops.add_v2(x, y, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py:482 add_v2
        "AddV2", x=x, y=y, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:750 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py:592 _create_op_internal
        compute_device)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:3536 _create_op_internal
        op_def=op_def)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:2016 __init__
        control_input_ops, op_def)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 128 and 1024 for '{{node tf_bart_for_conditional_generation/model/decoder/layers.0/encoder_attn/add}} = AddV2[T=DT_FLOAT](tf_bart_for_conditional_generation/model/decoder/layers.0/encoder_attn/Reshape_6, tf_bart_for_conditional_generation/model/decoder/mul_3)' with input shapes: [128,16,1,8], [1024,1,1,1].
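
For reference, the call that raises the error is essentially the following (simplified from my finetune_and_eval_model helper; tf_tokenized_train_ds and tf_tokenized_valid_ds are the tf.data datasets built from the tokenized splits, and the compile step is approximate):

```python
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM
from wandb.keras import WandbCallback

# Load the TF BART checkpoint (TFBartForConditionalGeneration under the hood)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Compile with just an optimizer; the exact loss setup is omitted in this sketch
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

model.fit(
    tf_tokenized_train_ds,           # batched tf.data.Dataset of tokenized examples
    validation_data=tf_tokenized_valid_ds,
    epochs=1,
    callbacks=[WandbCallback()],     # Weights & Biases logging
)
```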

What is causing this shape mismatch between 128 and 1024, and how can I fix it?