--
-- template: pg_mul_float
--
-- <DELAY=3>
--
-- <$w1>-bit floating-point multiplier
--
-- srca, srcb, dst
-- float:
-- 1     <$w1-$m1-1>            <$m1>
-- sign  exponent     mantissa
--
-- In this implementation (A), the mantissa is passed to the DSP together
-- with its hidden-1 bit. This is likely to be faster than implementation B,
-- described in pg_mul_float.nohidden1.template.vhd.
--
-- On the other hand, A requires one more DSP than B if the mantissa bit
-- width is exactly 9n (where n is a natural number). However, such a case
-- does not occur very often.
-- 
--
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
use ieee.std_logic_unsigned.all;

-- Pipelined floating-point multiplier: dst = srca * srcb.
-- All three data ports have the same width and use the same layout, from the
-- most significant bit down: sign, exponent, mantissa.
-- The architecture registers its state on the rising edge of clk and the
-- result appears after three register stages.
entity <$mname> is
  port (
    srca   : in  std_logic_vector(<$w1-1> downto 0);  -- first operand
    srcb   : in  std_logic_vector(<$w1-1> downto 0);  -- second operand
    dst    : out std_logic_vector(<$w1-1> downto 0);  -- product, forced to all zeros when an operand is zero
    clk    : in  std_logic                            -- pipeline clock
  );
end <$mname>;

-- Architecture rtl_dsp2: three-stage pipelined floating-point multiplier.
--
--   stage 1 : sign XOR, zero detection, exponent sum with the offset removed
--   stage 2 : sign / zero flag / exponent are delayed one cycle while the
--             mantissa product is produced by mul_int (DELAY => 2;
--             presumably two clock cycles of latency - confirm against
--             the mul_int implementation)
--   stage 3 : mantissa normalisation, exponent increment, output registers
--
-- Rounding is "biased force-1": the least significant mantissa bit of the
-- result is set whenever any of the discarded product bits is non-zero.
--
-- The only special case handled is a zero operand (every bit below the sign
-- bit is zero), which forces the whole result word to zero.  The carry-outs
-- of both exponent adders are left unconnected, so exponent overflow or
-- underflow is not detected here.
architecture rtl_dsp2 of <$mname> is

  component mul_int
    generic (
      WIDTH : integer;
      DELAY : integer
    );
    port (
      clock        : in  std_logic;
      dataa        : in  std_logic_vector(WIDTH-1 downto 0);
      datab        : in  std_logic_vector(WIDTH-1 downto 0);
      result       : out std_logic_vector(WIDTH*2-1 downto 0)
    );
  end component;

  component lpm_add_sub
    generic (
      LPM_WIDTH     : integer;
      LPM_DIRECTION : string
    );
    port (
      dataa  : in  std_logic_vector(LPM_WIDTH-1 downto 0);
      datab  : in  std_logic_vector(LPM_WIDTH-1 downto 0);
      cout   : out std_logic;
      result : out std_logic_vector(LPM_WIDTH-1 downto 0)
    );
  end component;

  -- Unpacked operand fields (combinational).
  signal sx0, sy0 : std_logic;
  signal ex0, ey0 : std_logic_vector(<$w1-$m1-2> downto 0);
  signal mx0, my0 : std_logic_vector(<$m1-1> downto 0);

  -- Mantissas with the hidden-1 bit prepended; these feed the multiplier.
  -- They are named signals because a port-map actual that is an expression
  -- must be globally static in VHDL-87/93.
  signal mxh, myh : std_logic_vector(<$m1> downto 0);

  -- Sign and non-zero flag pipelines (the digit is the stage number).
  signal s1, s2, s3    : std_logic;
  signal nz1, nz2, nz3 : std_logic;

  -- Exponent pipeline; ea2 is e2 after the normalisation increment.
  signal e0, e1, e2, ea2, e3 : std_logic_vector(<$w1-$m1-2> downto 0);

  -- Exponent increment operand: zero-extended top bit of the product.
  signal einc    : std_logic_vector(<$w1-$m1-2> downto 0);

  signal mm2     : std_logic_vector(<$m1*2+1> downto 0);  -- (mantissa + hidden-1) * 2 bits.
  signal m3      : std_logic_vector(<$m1-1> downto 0);

begin

  -- Unpack operand A.
  sx0 <= srca(<$w1-1>);                            -- sign
  ex0 <= srca(<$w1-2> downto <$m1>);                   -- exponent
  mx0 <= srca(<$m1-1> downto 0);                    -- mantissa

  -- Unpack operand B.  It has the same width and layout as operand A (see
  -- the entity ports), so the same field boundaries are used.
  sy0 <= srcb(<$w1-1>);                            -- sign
  ey0 <= srcb(<$w1-2> downto <$m1>);                   -- exponent
  my0 <= srcb(<$m1-1> downto 0);                    -- mantissa

  -- Restore the hidden-1 bit in front of each mantissa.
  mxh <= '1' & mx0;
  myh <= '1' & my0;

  -- Sign bit: XOR of the operand signs, delayed to stage 3.
  process (clk) begin
    if rising_edge(clk) then
      s1 <= sx0 xor sy0;
      s2 <= s1;
      s3 <= s2;
    end if;
  end process;

  -- Non-zero flag: cleared when either operand has all bits below the sign
  -- bit equal to zero; delayed to stage 3 alongside the sign.
  process (clk) begin
    if rising_edge(clk) then
      if (srca(<$w1-2> downto 0) = conv_std_logic_vector(0, <$w1-1>) or
          srcb(<$w1-2> downto 0) = conv_std_logic_vector(0, <$w1-1>)) then
        nz1 <= '0';
      else
        nz1 <= '1';
      end if;
      nz2 <= nz1;
      nz3 <= nz2;
    end if;
  end process;

  u0: lpm_add_sub                       -- e0 <= ex0 + ey0
    generic map (LPM_WIDTH     => <$w1-$m1-1>,
                 LPM_DIRECTION => "ADD")
    port map(result => e0,
             dataa  => ex0,
             datab  => ey0);

  -- mm2 <= (1.mx0) * (1.my0)
  -- pipeline delay: 2
  u1: mul_int
    generic map (
      WIDTH => <$m1+1>,
      DELAY => 2
    )
    port map (
      dataa  => mxh,
      datab  => myh,
      result => mm2,
      clock  => clk
    );

  -- Normalize mm2.  The top bit of the product tells whether the leading 1
  -- sits in the top bit or in the bit below it; the kept mantissa window
  -- is chosen accordingly.
  process (clk) begin
    if rising_edge(clk) then
      if (mm2(<$m1*2+1>) = '0') then
        -- Top bit clear: the leading 1 is the next bit down.
        m3(<$m1-1> downto 1) <= mm2(<$m1*2-1> downto <$m1+1>);
        if (mm2(<$m1> downto 0) = conv_std_logic_vector(0, <$m1+1>)) then  -- biased force-1
          m3(0) <= '0';
        else
          m3(0) <= '1';
        end if;
      else
        -- Top bit set: the leading 1 is the top bit, window moves up by one.
        m3(<$m1-1> downto 1) <= mm2(<$m1*2> downto <$m1+2>);
        if (mm2(<$m1+1> downto 0) = conv_std_logic_vector(0, <$m1+2>)) then  -- biased force-1
          m3(0) <= '0';
        else
          m3(0) <= '1';
        end if;
      end if;
    end if;
  end process;

  -- Exponent is increased by 1 when the top bit of the product is set.
  einc <= conv_std_logic_vector(0, <$w1 -$m1 - 2>) & mm2(<$m1*2+1>);

  u5: lpm_add_sub
    generic map (
      LPM_WIDTH     => <$w1 - $m1 - 1>,
      LPM_DIRECTION => "ADD"
    )
    port map (
      dataa  => e2,
      datab  => einc,
      result => ea2
    );

  -- Exponent pipeline.
  process (clk) begin
    if rising_edge(clk) then
      e1 <= e0 - ('1' & conv_std_logic_vector(0, <$w1 - $m1 - 2>));  -- remove offset
      e2 <= e1;
      e3 <= ea2;
    end if;
  end process;

  -- Pack the result; a zero operand forces the whole output word to zero.
  with nz3 select
    dst <= s3 & e3 & m3 when '1',
    conv_std_logic_vector(0, <$w1>) when others;

end rtl_dsp2;

