--
-- template: pg_mul_float
--
-- <DELAY=3>
--
-- <$w1>-bit floating-point multiplier
--
-- srca, srcb, dst
-- float:
-- 1     <$w1-$m1-1>            <$m1>
-- sign  exponent     mantissa
--
-- The mantissa is passed to the DSP multiplier without the hidden-1 bit;
-- the hidden 1 is handled by this entity itself.  This always requires the
-- minimum number of DSP blocks, but the implementation in
-- pg_mul_float.hidden1.template.vhd may be faster.
--
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
use ieee.std_logic_unsigned.all;

-- Floating-point multiplier entity.
-- srca, srcb and dst all use the same packed format (MSB first):
-- sign bit, exponent field, mantissa field (stored without the hidden 1).
entity <$mname> is
  port (
    srca   : in  std_logic_vector(<$w1-1> downto 0);  -- first operand
    srcb   : in  std_logic_vector(<$w1-1> downto 0);  -- second operand
    dst    : out std_logic_vector(<$w1-1> downto 0);  -- product, registered; all zeros if either operand is zero
    clk    : in  std_logic                            -- pipeline clock (rising edge)
  );
end <$mname>;

--
-- Architecture rtl_dsp1: pipelined floating-point multiplier.
--
-- With a = mx0 and b = my0 being the stored mantissa fractions (hidden 1
-- not included), the mantissa product is expanded as
--
--   (1 + a) * (1 + b) = 1 + (a + b) + a*b
--
-- so only the a*b term goes through the multiplier (mul_int); the other
-- terms are formed with adders.
--
-- Pipeline registers (all clocked on the rising edge of clk):
--   stage 1 : s1, nz1, e1, mp1  (mm1 comes out of mul_int with DELAY => 1;
--                                presumably registered inside mul_int)
--   stage 2 : s2, nz2, e2, ma2, c2
--   stage 3 : dst
--
-- An operand whose exponent and mantissa bits are all zero is treated as
-- zero and forces dst to all zeros.  No exponent overflow/underflow
-- detection is performed here (the carry-outs of the exponent adders are
-- left unconnected).
--
architecture rtl_dsp1 of <$mname> is

  -- integer multiplier: result = dataa * datab, latency set by DELAY
  component mul_int
    generic (
      WIDTH : integer;
      DELAY : integer
    );
    port (
      clock        : in  std_logic;
      dataa        : in  std_logic_vector(WIDTH-1 downto 0);
      datab        : in  std_logic_vector(WIDTH-1 downto 0);
      result       : out std_logic_vector(WIDTH*2-1 downto 0)
    );
  end component;

  -- LPM adder/subtractor (used here only with LPM_DIRECTION => "ADD")
  component lpm_add_sub
    generic (
      LPM_WIDTH     : integer;
      LPM_DIRECTION : string
    );
    port (
      dataa  : in  std_logic_vector(LPM_WIDTH-1 downto 0);
      datab  : in  std_logic_vector(LPM_WIDTH-1 downto 0);
      cout   : out std_logic;
      result : out std_logic_vector(LPM_WIDTH-1 downto 0)
    );
  end component;

  -- force-1 rounding from IN_WIDTH bits down to OUT_WIDTH bits
  component round_ubf1
    generic (
      IN_WIDTH   : integer;
      OUT_WIDTH  : integer
    );
    port (
      indata  : in  std_logic_vector(IN_WIDTH-1 downto 0);
      outdata : out std_logic_vector(OUT_WIDTH-1 downto 0)
    );
  end component;

  -- exponent offset that is removed after adding the two biased exponents
  -- (only the MSB of the exponent field is set)
  constant e_bias : std_logic_vector(<$w1-$m1-2> downto 0)
    := '1' & conv_std_logic_vector(0, <$w1-$m1-2>);

  -- the hidden "1.0", aligned to m1 / m1a (two integer bits, then the fraction)
  constant m_one  : std_logic_vector(<$m1+1> downto 0)
    := "01" & conv_std_logic_vector(0, <$m1>);

  -- fields unpacked from srca (x) and srcb (y)
  signal sx0, sy0 : std_logic;
  signal ex0, ey0 : std_logic_vector(<$w1-$m1-2> downto 0);  -- eg. 6 downto 0
  signal mx0, my0 : std_logic_vector(<$m1-1> downto 0);  -- eg. 8 downto 0

  -- sign and non-zero flags travelling down the pipeline
  signal s1, s2   : std_logic;
  signal nz1, nz2 : std_logic;

  -- exponent path
  signal e0, e1, e2, ea2 : std_logic_vector(<$w1-$m1-2> downto 0);
  signal einc2    : std_logic_vector(<$w1-$m1-2> downto 0);  -- c2 zero-extended to exponent width

  -- mantissa path
  signal mm1      : std_logic_vector(<$m1*2-1> downto 0);   -- full product a*b
  signal mma1     : std_logic_vector(<$m1-1> downto 0);   -- a*b rounded to mantissa width
  signal mma1x    : std_logic_vector(<$m1> downto 0);     -- mma1 zero-extended by one bit
  signal mp0, mp1 : std_logic_vector(<$m1> downto 0);    -- one extra bit at MSB.  eg. 9 downto 0
  signal m1, m1a  : std_logic_vector(<$m1+1> downto 0);  -- two extra bits at MSB.  eg. 10 downto 0
  signal m1b      : std_logic_vector(<$m1> downto 0);     -- m1a with its LSB rounded away
  signal ma2      : std_logic_vector(<$m1-1> downto 0);   -- normalized result mantissa
  signal c2       : std_logic;                           -- '1' when the product was >= 2.0

begin

  -- Unpack both operands.  srcb has the same width and layout as srca
  -- (see the port declaration), so the same field boundaries are used.
  sx0 <= srca(<$w1-1>);                            -- sign
  ex0 <= srca(<$w1-2> downto <$m1>);                   -- exponent
  mx0 <= srca(<$m1-1> downto 0);                    -- mantissa

  sy0 <= srcb(<$w1-1>);                            -- sign
  ey0 <= srcb(<$w1-2> downto <$m1>);                   -- exponent
  my0 <= srcb(<$m1-1> downto 0);                    -- mantissa

  -- sign bit: product sign is the XOR of the operand signs
  process (clk) begin
    if rising_edge(clk) then
      s1 <= sx0 xor sy0;
      s2 <= s1;
    end if;
  end process;

  -- non-zero bit: cleared when either operand is zero (sign bit ignored)
  process (clk) begin
    if rising_edge(clk) then
      if (srca(<$w1-2> downto 0) = conv_std_logic_vector(0, <$w1-1>) or
          srcb(<$w1-2> downto 0) = conv_std_logic_vector(0, <$w1-1>)) then
        nz1 <= '0';
      else
        nz1 <= '1';
      end if;
      nz2 <= nz1;
    end if;
  end process;

  -- e0 <= ex0 + ey0
  u0: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$w1-$m1-1>,
      LPM_DIRECTION => "ADD"
    )
    port map(
      dataa  => ex0,
      datab  => ey0,
      result => e0
    );

  process (clk) begin
    if rising_edge(clk) then
      e1 <= e0 - e_bias;  -- remove offset
      e2 <= e1;
    end if;
  end process;

  -- mm1 <= mx0 * my0
  u1: mul_int
    generic map (
      WIDTH => <$m1>,
      DELAY => 1
    )
    port map (
      clock  => clk,
      result => mm1,
      dataa  => mx0,
      datab  => my0);

  -- force-1 rounding
  u2: round_ubf1
    generic map (
      IN_WIDTH  => <$m1*2>,
      OUT_WIDTH => <$m1>
    )
    port map (
      indata  => mm1,
      outdata => mma1
    );

  -- mp0 <= mx0 + my0
  u3: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$m1>,
      LPM_DIRECTION => "ADD"
    )
    port map (
      dataa  => mx0,
      datab  => my0,
      cout   => mp0(<$m1>),
      result => mp0(<$m1-1> downto 0)
    );

  -- delay mp0 by one clock so that it lines up with mm1 / mma1
  process (clk) begin
    if rising_edge(clk) then
      mp1 <= mp0;
    end if;
  end process;

  -- Zero-extend mma1 to the adder width.  This is done on a signal rather
  -- than inside the port map because an expression that depends on a
  -- signal is not a legal port-map actual in VHDL-87/93.
  mma1x <= '0' & mma1;

  -- m1 <= mma1 + mp1
  u4: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$m1+1>,
      LPM_DIRECTION => "ADD"
    )
    port map (
      dataa  => mma1x,
      datab  => mp1,
      cout   => m1(<$m1+1>),
      result => m1(<$m1> downto 0)
    );

  -- add 1.0 to m1.
  u5: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$m1+2>,
      LPM_DIRECTION => "ADD"
    )
    port map (
      dataa  => m1,
      datab  => m_one,
      result => m1a
    );

  -- now m1a is in the range of 1.0 <= m1a < 4.0.
  --   01 mm..m : 1.0 <= m1a < 2.0
  --   10 mm..m : 2.0 <= m1a < 3.0
  --   11 mm..m : 3.0 <= m1a < 4.0

  --
  -- normalize m1a.
  --

  -- round LSB of m1a to obtain m1b,
  u6: round_ubf1
    generic map (
      IN_WIDTH  => <$m1+2>,
      OUT_WIDTH => <$m1+1>
    )
    port map (
      indata  => m1a,
      outdata => m1b
    );

  -- select m1a or m1b: when m1a >= 2.0 the one-bit-shifted (rounded) value
  -- m1b is used, and c2 records that the exponent must be incremented.
  process (clk) begin
    if rising_edge(clk) then
      c2 <= m1a(<$m1+1>);
      if (m1a(<$m1+1>) = '1') then
        ma2 <= m1b(<$m1-1> downto 0);
      else
        ma2 <= m1a(<$m1-1> downto 0);
      end if;
    end if;
  end process;

  -- c2 zero-extended to the exponent width (kept out of the port map for
  -- the same reason as mma1x)
  einc2 <= conv_std_logic_vector(0, <$w1-$m1-2>) & c2;

  -- exponent would be increased by 1.
  u7: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$w1-$m1-1>,
      LPM_DIRECTION => "ADD"
    )
    port map (
      dataa  => e2,
      datab  => einc2,
      result => ea2
    );

  -- output register: pack sign / exponent / mantissa, or force zero
  process (clk) begin
    if rising_edge(clk) then
      if (nz2 = '1') then
        dst(<$w1-1>)          <= s2;
        dst(<$w1-2> downto <$m1>) <= ea2;
        dst(<$m1-1> downto 0)  <= ma2;
      else
        dst <= conv_std_logic_vector(0, <$w1>);
      end if;
    end if;
  end process;

end rtl_dsp1;
