This is a 5-input carry-save adder I made in an attempt to fit two full chains in an LX150.

I don't know if this helps you guys out, but I don't have time to test it myself at the moment.

download it here:

http://www.omegav.ntnu.no/~kamben/adder5x.vhd or copy-paste this:

-- This block uses 94 LUTs with only 29 Carry chain LUTs. (sliceM/L) (implemented purely combinatorial, no regs )

-- XST synth of 4 or 5 input adder uses 64 LUTs with 64 carry chain LUTs. (sliceM/L) (implemented purely combinatorial, no regs )

LIBRARY IEEE;

USE ieee.std_logic_1164.ALL;

USE ieee.std_logic_unsigned."+";

-- UNISIM declares the LUT6_2 primitive that the architecture instantiates.
LIBRARY UNISIM;

USE UNISIM.vcomponents.ALL;

-- adder5x: adds five 32-bit operands and returns the low 32 bits of the sum.
entity adder5x is
  port (
    -- Reset and clock. Neither is referenced by the architecture as long as
    -- its optional register processes remain commented out.
    reset : in  std_logic;
    clk   : in  std_logic;

    -- The five operands.
    ina   : in  std_logic_vector(31 downto 0);
    inb   : in  std_logic_vector(31 downto 0);
    inc   : in  std_logic_vector(31 downto 0);
    ind   : in  std_logic_vector(31 downto 0);
    ine   : in  std_logic_vector(31 downto 0);

    -- ina + inb + inc + ind + ine, truncated to 32 bits.
    qout  : out std_logic_vector(31 downto 0)
  );
end adder5x;

architecture rtl of adder5x is

  -- Working copies of the five operands.
  signal a, b, c, d, e : std_logic_vector(31 downto 0);

  -- Stage 1 outputs: for every bit position, the 3-bit count of operand bits
  -- that are set (S1 = count bit 0, S2 = count bit 1, S3 = count bit 2).
  signal S1, S2, S3 : std_logic_vector(31 downto 0);

  -- Stage 2 outputs: two vectors whose sum equals the five-operand sum.
  -- SB has no bits 1..0; two zero bits are appended before the final add.
  signal SA, SAr : std_logic_vector(31 downto 0);
  signal SB, SBr : std_logic_vector(31 downto 2);

  -- Result of the final addition.
  signal qr : std_logic_vector(31 downto 0);

  -- Optional reference sum for checking the result:
  -- signal fasit : std_logic_vector(31 downto 0);

begin

  ------------------------------------------------------------------------
  -- Input stage (purely combinatorial as written).
  -- To register the inputs, move these assignments into a process
  -- clocked on the rising edge of clk.
  ------------------------------------------------------------------------
  a <= ina;
  b <= inb;
  c <= inc;
  d <= ind;
  e <= ine;

  -- Optional reference sum for checking the result:
  -- fasit <= a + b + c + d + e;

  ------------------------------------------------------------------------
  -- Stage 1: first LUT column.
  -- Five single-bit inputs -> 3-bit count (0..5) per bit position.
  ------------------------------------------------------------------------
  LUT_stage1 : for bitpos in 0 to 31 generate

    -- Count bit 0 is the parity of the five operand bits.
    S1(bitpos) <= a(bitpos) xor b(bitpos) xor c(bitpos) xor d(bitpos) xor e(bitpos);

    -- Forced-LUT alternative for the parity bit: slightly faster, uses more
    -- LUTs overall. Could save 1 sliceM/L for every 2 adder blocks and might
    -- make routing easier.
    -- LUT5_inst1a : LUT5
    --   generic map (INIT => x"96696996")
    --   port map (O  => S1(bitpos),
    --             I0 => a(bitpos), I1 => b(bitpos), I2 => c(bitpos),
    --             I3 => d(bitpos), I4 => e(bitpos));

    -- Count bits 1 and 2 come out of one dual-output LUT. With I5 tied high,
    -- O5 uses INIT(31 downto 0)  = x"177E7EE8": set when 2 or 3 inputs are '1',
    -- O6 uses INIT(63 downto 32) = x"E8808000": set when 4 or 5 inputs are '1'.
    LUT_inst1bc : LUT6_2
      generic map (
        INIT => x"E8808000177E7EE8")
      port map (
        I0 => a(bitpos),
        I1 => b(bitpos),
        I2 => c(bitpos),
        I3 => d(bitpos),
        I4 => e(bitpos),
        I5 => '1',
        O5 => S2(bitpos),
        O6 => S3(bitpos));

  end generate;

  ------------------------------------------------------------------------
  -- Stage 2: merge the counts of two adjacent bit positions (2p, 2p+1).
  -- Relative to bit 2p the pair is worth
  --   S1(2p) + 2*(S2(2p) + S1(2p+1)) + 4*(S3(2p) + S2(2p+1)) + 8*S3(2p+1)
  -- which is at most 5 + 2*5 = 15, i.e. exactly four bits. The two low
  -- bits go to SA(2p), SA(2p+1); the two high bits go to SB(2p+2), SB(2p+3).
  ------------------------------------------------------------------------
  LUT_stage2A : for pair in 0 to 15 generate
    SA(2*pair)     <= S1(2*pair);
    SA((2*pair)+1) <= S2(2*pair) xor S1((2*pair)+1);
  end generate;

  -- Only pairs 0..14 are built: pair 15 would drive bits 32 and 33, which
  -- lie outside the 32-bit result.
  -- O5 (INIT low half)  is bit 2 of the pair value,
  -- O6 (INIT high half) is bit 3 of the pair value.
  -- Input codes with the S2 and S3 bits of the same position both set cannot
  -- occur for a count of 0..5; their INIT entries are 0.
  LUT_stage2B : for pair in 0 to 14 generate
    LUT_inst2cd : LUT6_2
      generic map (
        INIT => x"0077640000641364")
      port map (
        I0 => S2(2*pair),        -- B1: count bit 1 of the even position
        I1 => S3(2*pair),        -- C1: count bit 2 of the even position
        I2 => S1((2*pair)+1),    -- A2: count bit 0 of the odd position
        I3 => S2((2*pair)+1),    -- B2: count bit 1 of the odd position
        I4 => S3((2*pair)+1),    -- C2: count bit 2 of the odd position
        I5 => '1',
        O5 => SB((2*pair)+2),
        O6 => SB((2*pair)+3));
  end generate;

  ------------------------------------------------------------------------
  -- Pipeline stage (purely combinatorial as written).
  -- If the whole chain has only one pipeline register, this might be a
  -- good place to put it: move these two assignments into a process
  -- clocked on the rising edge of clk.
  ------------------------------------------------------------------------
  SAr <= SA;
  SBr <= SB;

  ------------------------------------------------------------------------
  -- Final stage: regular carry-chain adder of the two partial sums.
  -- This assignment can likewise be moved into a clocked process to
  -- register the output.
  ------------------------------------------------------------------------
  qr <= SAr + (SBr & "00");

  qout <= qr;

end rtl;