diff --git a/hdl/data_serializer.vhd b/hdl/data_serializer.vhd
index 040fe00d2440b407e1b5b841578d06004ad5d072..6592884be8715602132cebe0127546f39f446df0 100644
--- a/hdl/data_serializer.vhd
+++ b/hdl/data_serializer.vhd
@@ -80,7 +80,7 @@ begin
                 if corrout_valid= '1' then
                     -- start on valid
                     run_serial <= '1';
-                    cnt <= to_unsigned(C_N_MM_PSC, cnt'length);
+                    cnt <= to_unsigned(C_N_MM_PSC-1, cnt'length);
 
                 end if;
 
diff --git a/hdl/matrix_mul.vhd b/hdl/matrix_mul.vhd
index dacbc77ca2be5ba58b4139cfcd292ca799e38a28..c410766cf68ae4bcdc0c481eed766cdafccc07b3 100644
--- a/hdl/matrix_mul.vhd
+++ b/hdl/matrix_mul.vhd
@@ -58,6 +58,7 @@ architecture rtl of matrix_mul is
 
     signal new_seq : std_logic;
     signal mul_done : std_logic;
+    signal r_mul_done : std_logic;
 
 begin
 
@@ -101,7 +102,9 @@ begin
         if rst_n = '0' then
             id_cnt <= (others => '1');
             mul_done <= '0';
+            r_mul_done <= '0';
         elsif rising_edge(clk) then
+            r_mul_done <= mul_done;
             if id_cnt = 0 then
                 id_cnt <= unsigned(id_cnt_load);
                 mul_done <= '1';
@@ -217,7 +220,7 @@ begin
     --------------------
     -- OUTPUT CONNECT --
     --------------------
-    matmult_tvalid   <= mul_done;
+    matmult_tvalid  <= r_mul_done;
     matmult_seq     <= r_seq;
 
 end architecture;