This is a 5-input carry-save adder I made in an attempt to fit two full chains in an LX150.
I don't know if this helps you guys out, but I don't have time to test it myself at the moment.
Download it here:
http://www.omegav.ntnu.no/~kamben/adder5x.vhd
or copy and paste this:
-- This block uses 94 LUTs with only 29 Carry chain LUTs. (sliceM/L) (implemented purely combinatorial, no regs )
-- XST synth of 4 or 5 input adder uses 64 LUTs with 64 carry chain LUTs. (sliceM/L) (implemented purely combinatorial, no regs )
LIBRARY IEEE;
USE ieee.std_logic_1164.ALL;
USE ieee.std_logic_unsigned."+";
--Library UNISIM;
--use UNISIM.vcomponents.all;
-- adder5x: adds five 32-bit operands and presents a 32-bit result on qout.
ENTITY adder5x IS
  PORT (
    -- reset/clk are only referenced by the commented-out register processes
    -- of architecture rtl; they are kept so registers can be enabled later.
    reset, clk              : IN  std_logic;
    -- The five operands that are summed.
    ina, inb, inc, ind, ine : IN  std_logic_vector(31 DOWNTO 0);
    -- Sum of the five operands (32 bits wide, so carries out of bit 31 are lost).
    qout                    : OUT std_logic_vector(31 DOWNTO 0)
  );
END adder5x;
-- LUT6_2 is a Xilinx primitive whose component declaration lives in
-- UNISIM.vcomponents. This context clause has to be active: without it the
-- LUT6_2 instantiations below have no visible declaration and the
-- architecture cannot be analyzed.
LIBRARY UNISIM;
USE UNISIM.vcomponents.ALL;

-- Purely combinatorial 5-operand, 32-bit carry-save adder.
--
--   stage 1 : for every bit position i, count the ones among a(i)..e(i).
--             The 3-bit count is S3(i) S2(i) S1(i) and lies in 0..5.
--   stage 2 : bit positions are processed in pairs (2k, 2k+1). The pair value
--               count(2k) + 2*count(2k+1)   (0..15, exactly 4 bits)
--             is re-encoded as
--               SA(2k) + 2*SA(2k+1) + 4*SB(2k+2) + 8*SB(2k+3)
--             so the five operands collapse into the two vectors SA and SB.
--   final   : one ordinary carry-chain addition SA + SB gives the result.
ARCHITECTURE rtl OF adder5x IS
  -- Operand copies; this is where an input register stage would go.
  SIGNAL a, b, c, d, e : std_logic_vector(31 DOWNTO 0);
  -- Result before it is driven onto qout; output register location.
  SIGNAL qr            : std_logic_vector(31 DOWNTO 0);
  -- Stage-2 outputs (SA, SB) and their optionally registered copies.
  -- SB has no bits 1..0: every pair contributes to SB two positions higher.
  SIGNAL SA, SAr       : std_logic_vector(31 DOWNTO 0);
  SIGNAL SB, SBr       : std_logic_vector(31 DOWNTO 2);
  -- Stage-1 per-bit population count: S1 = bit 0, S2 = bit 1, S3 = bit 2.
  SIGNAL S1, S2, S3    : std_logic_vector(31 DOWNTO 0);
  -- Reference result for simulation, see the commented assignment below.
  --SIGNAL fasit : std_logic_vector(31 downto 0);
BEGIN

  ------------------------------------------------------------------
  -- Optional register locations.
  -- All three are plain concurrent assignments here (no registers). To
  -- pipeline, wrap a group in a process clocked on rising clk.
  ------------------------------------------------------------------

  -- input_reg
  a <= ina;
  b <= inb;
  c <= inc;
  d <= ind;
  e <= ine;

  -- pipe_reg: if your whole "chain" only has 1 pipeline register,
  -- this might be a good place to put it.
  SAr <= SA;
  SBr <= SB;

  -- output_reg: regular carry-chain adder for the last stage.
  -- SBr covers bits 31..2, so it is padded with two zero LSBs to 32 bits.
  qr <= SAr + (SBr & "00");

  qout <= qr;

  --fasit<=a+b+c+d+e;

  ------------------------------------------------------------------
  -- Stage 1: first LUT column of the adder.
  -- 5 single-bit inputs -> 3-bit sum (S3 S2 S1) per bit position.
  ------------------------------------------------------------------
  LUT_stage1 : FOR i IN 0 TO 31 GENERATE

    -- Bit 0 of the count is the parity of the five input bits.
    S1(i) <= a(i) XOR b(i) XOR c(i) XOR d(i) XOR e(i);

    -- Forced-LUT alternative for S1. Slightly faster, uses more overall LUTs.
    -- Could save 1 sliceM/L for every 2 adder blocks. Might make routing easier.
    --LUT5_inst1a : LUT5
    --generic map (
    --INIT => x"96696996")
    --port map (
    --O => S1(i),
    --I0 => a(i),
    --I1 => b(i),
    --I2 => c(i),
    --I3 => d(i),
    --I4 => e(i));

    -- One dual-output LUT yields the two upper count bits. I5 is tied to '1',
    -- so O6 reads the upper INIT half and O5 the lower INIT half, both
    -- addressed by I4..I0:
    --   O5 = S2(i): set when 2 or 3 inputs are '1' (INIT[31:0]  = 177E7EE8)
    --   O6 = S3(i): set when 4 or 5 inputs are '1' (INIT[63:32] = E8808000)
    LUT_inst1bc : LUT6_2
      GENERIC MAP (
        INIT => x"E8808000177E7EE8")
      PORT MAP (
        O6 => S3(i),
        O5 => S2(i),
        I0 => a(i),
        I1 => b(i),
        I2 => c(i),
        I3 => d(i),
        I4 => e(i),
        I5 => '1');

  END GENERATE;

  ------------------------------------------------------------------
  -- Stage 2: 2 x 3-bit LUT sums -> 2 + 2 bit output sum.
  -- max sum = 5 + (2*5) = 15, range 0-15 -> exactly 4 bits.
  ------------------------------------------------------------------

  -- Lower two bits of each pair value go to SA at the pair's own position.
  LUT_stage2A : FOR i IN 0 TO 15 GENERATE
    SA(i*2)       <= S1(i*2);                    -- weight 1: no carry in
    SA((i*2) + 1) <= S2(i*2) XOR S1((i*2) + 1);  -- weight 2: B1 + A2, mod 2
  END GENERATE;

  -- SB has no bits 0 and 1 (declared 31 downto 2).
  --SB(0)<='0';
  --SB(1)<='0';

  -- Upper two bits of each pair value go to SB, two positions above the pair.
  -- Only pairs 0..14 are generated: the upper bits of pair 15 would land on
  -- bit positions 32 and 33, outside the 32-bit result.
  -- Input combinations with S2 and S3 of the same bit both '1' cannot occur
  -- (the count never exceeds 5); the INIT value holds '0' for them.
  LUT_stage2B : FOR i IN 0 TO 14 GENERATE
    LUT_inst2cd : LUT6_2
      GENERIC MAP (
        INIT => x"0077640000641364")
      PORT MAP (
        O6 => SB((i*2) + 3),   -- bit 3 of the pair value
        O5 => SB((i*2) + 2),   -- bit 2 of the pair value
        I0 => S2(i*2),         -- B1
        I1 => S3(i*2),         -- C1
        I2 => S1((i*2) + 1),   -- A2
        I3 => S2((i*2) + 1),   -- B2
        I4 => S3((i*2) + 1),   -- C2
        I5 => '1');            -- selects the upper INIT half for O6
  END GENERATE;

END rtl;