--------------------------------------------------------------------------------
--! @file
--! @brief pp_fir_filter.
--!        This implements a poly-phase fir filter that can be used for
--!        rational resampling or rational sample delay.
--!        The taps of the FIR filter are generated at compile time and start
--!        as a Hann-windowed sinc function.  0-phase offset is then normalized
--!        to be 0.98 amplitude.
--!        The generics determine the resolution of the fir-filter, as well as
--!        as the number of phases.
--------------------------------------------------------------------------------
library ieee;
    use ieee.std_logic_1164.all;
    use ieee.numeric_std.all;
    use ieee.math_real.all;
library work;
    use work.er_pack.all;

entity pp_fir_filter is
    generic (
        --! The width of each tap in bits
        taps_width_g    : natural :=  16;
        --! The number of lobes.  This is basically the number of taps per filter
        num_lobes_g     : natural :=   8;
        --! The number of parallel channels
        num_channels_g  : natural :=   1;
        --! The number of taps per lobe
        taps_per_lobe_g : natural := 512;
        --! The number of taps to skip to get to the next tap
        step_size_g     : natural := 512);
    port (
        -- standard ports
        clk_i : in  std_logic;
        rst_i : in  std_logic;

        -- input data ports
        --! Run the filter without taking another sample
        run_i     : in  std_logic;
        phase_i   : in  std_logic_vector(log2(taps_per_lobe_g) downto 0);
        data_en_i : in  std_logic;
        data_i    : in  std_logic_vector(num_channels_g*taps_width_g-1 downto 0);

        -- output data ports
        data_o    : out std_logic_vector(num_channels_g*taps_width_g-1 downto 0);
        data_en_o : out std_logic);
end entity pp_fir_filter;

architecture behavior of pp_fir_filter is
    ----------------------------------------------------------------------------
    -- Types, Subtypes, and Constants
    ----------------------------------------------------------------------------
    subtype word_t  is signed(1*taps_width_g-1 downto 0);
    subtype dword_t is signed(2*taps_width_g-1 downto 0);
    subtype save_range is natural range 2*taps_width_g-2 downto 1*taps_width_g-1;
    type word_vector_t  is array (integer range <>) of word_t;
    type dword_vector_t is array (integer range <>) of dword_t;
    type rom_t          is array (integer range <>) of signed(data_i'range);

    -- The state machine deals with the MACCs
    type state_type is (
        idle_state,  -- Waiting for input signal
        load_state,  -- Load the sample into the input ram
        mult_state,  -- First multiply does not accumulate product
        macc_state,  -- P += A*B
        save_state); -- Save the output
    type dsp_opcode_type is (
        clear,       -- P  = 0
        mult,        -- P  = A*B
        macc,        -- P += A*B
        hold);       -- P  = P
    constant round_val  : dword_t := shift_left(to_signed(1, dword_t'length), taps_width_g-2);

    -- We want the phase offset to be in relation to the middle of the center
    -- lobe.  For this reason, we will need to determine the offset of the first
    -- sample in relation to the step_size, taps_per_lobe, and the number of
    -- lobes
    constant phase_offset_c : natural :=
--      (num_lobes_g * (taps_per_lobe_g - step_size_g+1)) mod taps_per_lobe_g;
        (num_lobes_g/2 * (taps_per_lobe_g - step_size_g));
    constant num_regs_c : natural :=
--      (num_lobes_g * (taps_per_lobe_g / step_size_g));
        (num_lobes_g);

    ----------------------------------------------------------------------------
    -- functions
    ----------------------------------------------------------------------------
    function load_sinc_rom (
        taps_per_lobe : natural;
        num_lobes     : natural)
    return word_vector_t is
        -- The returned ram
        variable rom      : word_vector_t(0 to taps_per_lobe * num_lobes-1);

        -- Stuff for the actual sinc calculation
        variable real_rom : real_vector(rom'range);
        variable half     : real := real(rom'length/2);
        variable nm1      : real := real(rom'length-1);
        variable phase    : real;
        variable sinc     : real;
        variable hann     : real;

        -- for power calculation
        variable power : real;
    begin
        ------------------------------------------------------------------------
        -- Tap generation
        ------------------------------------------------------------------------
        for idx in real_rom'range loop
            -- Determine the phase, but multiply it by PI to get the correct
            -- phase shift
            phase := math_pi * (real(idx) - half) / real(taps_per_lobe);

            -- Don't divide by zero
            if phase = 0.0 then
                sinc := 1.0;
            else
                sinc := sin(phase) / phase;
            end if;

            -- Multiply it by a hann window
            hann := 0.5 * (1.0 - cos(2.0*math_pi*real(idx)/nm1));

            -- Put it in the rom
            real_rom(idx) := sinc*hann;
        end loop;

        ------------------------------------------------------------------------
        -- Energy measurement
        ------------------------------------------------------------------------
        -- Now that the ram is complete, we still need to make sure that we
        -- scale everything to be a power of one.  This is to make sure that we
        -- don't overflow during the actual addition.
        power := 0.0;
        for idx in 0 to num_regs_c-1 loop
            power := power + real_rom(phase_offset_c + idx*step_size_g);
        end loop;

        ------------------------------------------------------------------------
        -- Normalization
        ------------------------------------------------------------------------
        -- Now put it in the actual ram
        for idx in rom'range loop
            real_rom(idx) := real_rom(idx) * (0.98 / power);
            rom     (idx) := signed(to_slv(real_rom(idx), word_t'length));
        end loop;

        -- return it
        return rom;
    end function load_sinc_rom;

    -----------------------------------------------------------------------------
    constant taps_rom : word_vector_t := load_sinc_rom(taps_per_lobe_g, num_lobes_g);

    ----------------------------------------------------------------------------
    -- Signals
    ----------------------------------------------------------------------------
    signal phase_reg : natural;
    signal data_reg  : std_logic_vector(data_i'range);

    signal state      : state_type;
    signal dsp_opcode : dsp_opcode_type;

    -- DSP Signals
    signal a : word_vector_t (0 to num_channels_g-1);
    signal b : word_t;
    signal p : dword_vector_t(0 to num_channels_g-1);
    signal r : word_vector_t (0 to num_channels_g-1);

    -- RAM/ROM Signals
    signal taps_addr      : natural;
    signal next_taps_addr : natural;
    signal z_addr         : natural;
    signal z_ram          : rom_t(0 to num_regs_c-1);
    signal z_ram_en       : std_logic;

    -- Quantization signals
    signal q : dword_vector_t(0 to num_channels_g-1);

    -- for internal testing
    signal rom_data_test : word_t;
    signal rom_addr_test : natural;

--------------------------------------------------------------------------------
begin
--------------------------------------------------------------------------------
    -- The actual fir filter part
    -----------------------------------------------------------------------------
    -- Direct signal assignments
    -----------------------------------------------------------------------------
    a_gen : for idx in 0 to num_channels_g-1 generate
        -- Get the input for the multiplication
        a(idx) <= z_ram(z_addr)((idx+1)*taps_width_g-1 downto idx*taps_width_g);

        -- Since the rounding is combinational, we can sum it up here
        q(idx) <= p(idx) + round_val;

        -- Now the data out
        data_o((idx+1)*taps_width_g-1 downto idx*taps_width_g) <=
            std_logic_vector(r(idx));
    end generate a_gen;

    -- This one is easy
    b <= taps_rom(taps_addr);      -- Select MUX

    -----------------------------------------------------------------------------
    -- FIR process controls the main state machine behind the serial FIR
    -----------------------------------------------------------------------------
    fsm_proc : process(clk_i)
        variable idx_hi : natural;
        variable idx_lo : natural;
    begin
        if rising_edge(clk_i) then
            if rst_i = '1' then
                state          <= idle_state;
                dsp_opcode     <= clear;
                z_ram_en       <= '0';
                z_addr         <=  0 ;
                taps_addr      <=  0 ;
                next_taps_addr <=  0 ;
                data_en_o      <= '0';
--              data_o         <= (others => '0');
            else
                -- Default cases
                z_ram_en  <= '0';
                data_en_o <= '0';
                next_taps_addr <= next_taps_addr + step_size_g;

                -- Other cases
                case state is
                    -----------------------------------------------------------------
                    when idle_state =>
                        dsp_opcode <= clear;
                        z_addr     <=  0 ;
                        taps_addr  <=  0 ;
                        if data_en_i = '1' or run_i = '1' then
                            z_ram_en  <= data_en_i;
                            state     <= load_state;
                            phase_reg <= phase_offset_c + to_integer(unsigned(phase_i));
                            data_reg  <= data_i;
                        end if;
                    -----------------------------------------------------------------
                    when load_state =>
                        dsp_opcode     <= clear;
                        z_addr         <=  0 ;
                        taps_addr      <= phase_reg;
                        next_taps_addr <= phase_reg;
                        state          <= mult_state;
                    -----------------------------------------------------------------
                    when mult_state =>
                        dsp_opcode <= mult;
                        z_addr     <=  0 ;
                        taps_addr  <= phase_reg;
                        state      <= macc_state;
                    -----------------------------------------------------------------
                    when macc_state =>
                        dsp_opcode <= macc;

                        -- The delayed version of the incoming signal
--                      if next_taps_addr >= taps_rom'length then
                        if z_addr = z_ram'high then
                            state <= save_state;
                        else
                            z_addr    <= z_addr + 1;
                            taps_addr <= next_taps_addr;
                        end if;
                    -----------------------------------------------------------------
                    when save_state =>
                        dsp_opcode <= macc;
                        z_addr     <=  0 ;
                        data_en_o  <= '1';
                        state      <= idle_state;
                        for idx in q'range loop
                            r(idx) <= q(idx)(save_range);
                        end loop;
                    -----------------------------------------------------------------
                end case;
            end if;
        end if;
    end process fsm_proc;

    -----------------------------------------------------------------------------
    -- DSP48 process emulates a DSP48 (partially)
    -----------------------------------------------------------------------------
    alu_proc : process(clk_i)
    begin
        if rising_edge(clk_i) then
            if rst_i = '1' then
                p <= (others => (others => '0'));
            else
                case dsp_opcode is
                    ------------------------------------------------------------
                    when clear =>
                        p <= (others => (others => '0'));
                    ------------------------------------------------------------
                    when mult =>
                        for idx in p'range loop
                            p(idx) <= a(idx) * b;
                        end loop;
                    ------------------------------------------------------------
                    when macc =>
                        for idx in p'range loop
                            p(idx) <= p(idx) + a(idx) * b;
                        end loop;
                    ------------------------------------------------------------
                    when hold =>
                        null;
                    ------------------------------------------------------------
               end case;
           end if;
        end if;
    end process alu_proc;

    -----------------------------------------------------------------------------
    -- Shift RAM
    -----------------------------------------------------------------------------
    -- I'm calling it the z ram, since it is the z delay of the incoming signal
    shift_ram_proc : process(clk_i)
    begin
        if rising_edge(clk_i) then
            if rst_i = '1' then
                z_ram <= (others => (others => '0'));
            elsif z_ram_en = '1' then
                z_ram <= signed(data_reg) & z_ram(0 to z_ram'length-2);
            end if;
        end if;
    end process shift_ram_proc;

    ----------------------------------------------------------------------------
    -- tests
    ----------------------------------------------------------------------------
    -- synthesis off
    -- Test the rom by iterating through the rom
    rom_test_proc : process(clk_i)
    begin
        if rising_edge(clk_i) then
            if rst_i = '1' then
                rom_addr_test <= 0;
            else
                if rom_addr_test >= taps_rom'length-1 then
                    rom_addr_test <= 0;
                else
                    rom_addr_test <= rom_addr_test + 1;
                end if;
            end if;
        end if;
    end process rom_test_proc;

    -- combinational read
    rom_data_test <= taps_rom(rom_addr_test);
    -- synthesis on

end architecture behavior;