14 files changed, 395 insertions, 197 deletions
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index ce21cc1e6..85f45ac7f 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -17,6 +17,11 @@
  */
 
 // This file is included by the designs generated with `write_cxxrtl`. It is not used in Yosys itself.
+//
+// The CXXRTL support library implements compile-time specialized arbitrary-width arithmetic and provides
+// composite lvalues made out of bit slices and concatenations of lvalues. This allows the `write_cxxrtl` pass
+// to perform a straightforward translation of RTLIL structures to readable C++, relying on the C++ compiler
+// to unwrap the abstraction and generate efficient code.
 
 #ifndef CXXRTL_H
 #define CXXRTL_H
@@ -35,10 +40,19 @@
 
 #include <backends/cxxrtl/cxxrtl_capi.h>
 
-// The CXXRTL support library implements compile time specialized arbitrary width arithmetics, as well as provides
-// composite lvalues made out of bit slices and concatenations of lvalues. This allows the `write_cxxrtl` pass
-// to perform a straightforward translation of RTLIL structures to readable C++, relying on the C++ compiler
-// to unwrap the abstraction and generate efficient code.
+// CXXRTL essentially uses the C++ compiler as a hygienic macro engine that feeds an instruction selector.
+// It generates a lot of specialized template functions with relatively large bodies that, when inlined
+// into the caller and (for those with loops) unrolled, often expose many new optimization opportunities.
+// Because of this, most of the CXXRTL runtime must always be inlined for best performance.
+#ifndef __has_attribute // fallback for compilers without __has_attribute: report every attribute as unsupported
+#	define __has_attribute(x) 0
+#endif
+#if __has_attribute(always_inline) // force inlining where the attribute exists, otherwise use a plain `inline` hint
+#define CXXRTL_ALWAYS_INLINE inline __attribute__((__always_inline__))
+#else
+#define CXXRTL_ALWAYS_INLINE inline
+#endif
+
 namespace cxxrtl {
 
 // All arbitrary-width values in CXXRTL are backed by arrays of unsigned integers called chunks. The chunk size
@@ -52,6 +66,7 @@ namespace cxxrtl {
 // Therefore, using relatively wide chunks and clearing the high bits explicitly and only when we know they may be
 // clobbered results in simpler generated code.
 typedef uint32_t chunk_t;
+typedef uint64_t wide_chunk_t; // 64-bit accumulator type; value::mul() uses it to hold products of two chunks
 
 template<typename T>
 struct chunk_traits {
@@ -85,6 +100,7 @@ struct value : public expr_base<value<Bits>> {
 	value<Bits> &operator=(const value<Bits> &) = default;
 
 	// A (no-op) helper that forces the cast to value<>.
+	CXXRTL_ALWAYS_INLINE
 	const value<Bits> &val() const {
 		return *this;
 	}
@@ -101,6 +117,7 @@ struct value : public expr_base<value<Bits>> {
 	// The trunc, zext and sext operations add or remove most significant bits (i.e. on the left);
 	// the rtrunc and rzext operations add or remove least significant bits (i.e. on the right).
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> trunc() const {
 		static_assert(NewBits <= Bits, "trunc() may not increase width");
 		value<NewBits> result;
@@ -111,6 +128,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> zext() const {
 		static_assert(NewBits >= Bits, "zext() may not decrease width");
 		value<NewBits> result;
@@ -120,6 +138,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> sext() const {
 		static_assert(NewBits >= Bits, "sext() may not decrease width");
 		value<NewBits> result;
@@ -135,6 +154,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> rtrunc() const {
 		static_assert(NewBits <= Bits, "rtrunc() may not increase width");
 		value<NewBits> result;
@@ -154,6 +174,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> rzext() const {
 		static_assert(NewBits >= Bits, "rzext() may not decrease width");
 		value<NewBits> result;
@@ -165,13 +186,14 @@ struct value : public expr_base<value<Bits>> {
 			carry = (shift_bits == 0) ? 0
 				: data[n] >> (chunk::bits - shift_bits);
 		}
-		if (carry != 0)
-			result.data[result.chunks - 1] = carry;
+		if (shift_chunks + chunks < result.chunks)
+			result.data[shift_chunks + chunks] = carry;
 		return result;
 	}
 
 	// Bit blit operation, i.e. a partial read-modify-write.
 	template<size_t Stop, size_t Start>
+	CXXRTL_ALWAYS_INLINE
 	value<Bits> blit(const value<Stop - Start + 1> &source) const {
 		static_assert(Stop >= Start, "blit() may not reverse bit order");
 		constexpr chunk::type start_mask = ~(chunk::mask << (Start % chunk::bits));
@@ -196,6 +218,7 @@ struct value : public expr_base<value<Bits>> {
 	// than the operand. In C++17 these can be replaced with `if constexpr`.
 	template<size_t NewBits, typename = void>
 	struct zext_cast {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template zext<NewBits>();
 		}
@@ -203,6 +226,7 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits>
 	struct zext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template trunc<NewBits>();
 		}
@@ -210,6 +234,7 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits, typename = void>
 	struct sext_cast {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template sext<NewBits>();
 		}
@@ -217,17 +242,20 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits>
 	struct sext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template trunc<NewBits>();
 		}
 	};
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> zcast() const {
 		return zext_cast<NewBits>()(*this);
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> scast() const {
 		return sext_cast<NewBits>()(*this);
 	}
@@ -349,10 +377,17 @@ struct value : public expr_base<value<Bits>> {
 				: data[chunks - 1 - n] << (chunk::bits - shift_bits);
 		}
 		if (Signed && is_neg()) {
-			for (size_t n = chunks - shift_chunks; n < chunks; n++)
+			// Sign-fill every bit vacated by the shift. The fill boundary depends on the total shift amount,
+			// assuming shift_chunks/shift_bits are its whole-chunk and leftover-bit parts (as the copy loop suggests).
+			size_t shift_total    = shift_chunks * chunk::bits + shift_bits;
+			size_t top_chunk_idx  = shift_total > Bits ? 0 : (Bits - shift_total) / chunk::bits;
+			size_t top_chunk_bits = shift_total > Bits ? 0 : (Bits - shift_total) % chunk::bits;
+			for (size_t n = top_chunk_idx + 1; n < chunks; n++)
 				result.data[n] = chunk::mask;
-			if (shift_bits != 0)
-				result.data[chunks - shift_chunks] |= chunk::mask << (chunk::bits - shift_bits);
+			if (shift_total != 0)
+				result.data[top_chunk_idx] |= chunk::mask << top_chunk_bits;
+			// The fill may have set bits above Bits in the most significant chunk; clear them again.
+			result.data[result.chunks - 1] &= result.msb_mask;
 		}
 		return result;
 	}
@@ -425,6 +455,24 @@ struct value : public expr_base<value<Bits>> {
 		bool overflow = (is_neg() == !other.is_neg()) && (is_neg() != result.is_neg());
 		return result.is_neg() ^ overflow; // a.scmp(b) ≡ a s< b
 	}
+
+	template<size_t ResultBits> // chunk-wise long multiplication of *this by other, producing a ResultBits-wide value
+	value<ResultBits> mul(const value<Bits> &other) const {
+		value<ResultBits> result;
+		wide_chunk_t wide_result[result.chunks + 1] = {}; // one wide accumulator per result chunk, plus a slot for the last carry
+		for (size_t n = 0; n < chunks; n++) {
+			for (size_t m = 0; m < chunks && n + m < result.chunks; m++) { // partial products past the result width are skipped
+				wide_result[n + m] += wide_chunk_t(data[n]) * wide_chunk_t(other.data[m]); // operands are widened before multiplying
+				wide_result[n + m + 1] += wide_result[n + m] >> chunk::bits; // move the overflow into the next accumulator
+				wide_result[n + m] &= chunk::mask; // presumably keeps only the low chunk::bits bits here
+			}
+		}
+		for (size_t n = 0; n < result.chunks; n++) {
+			result.data[n] = wide_result[n]; // narrowing copy; the extra carry slot is not copied
+		}
+		result.data[result.chunks - 1] &= result.msb_mask; // presumably clears the unused bits of the top chunk
+		return result;
+	}
 };
 
 // Expression template for a slice, usable as lvalue or rvalue, and composable with other expression templates here.
@@ -439,12 +487,14 @@ struct slice_expr : public expr_base<slice_expr<T, Stop, Start>> {
 	slice_expr(T &expr) : expr(expr) {}
 	slice_expr(const slice_expr<T, Stop, Start> &) = delete;
 
+	CXXRTL_ALWAYS_INLINE
 	operator value<bits>() const {
 		return static_cast<const value<T::bits> &>(expr)
 			.template rtrunc<T::bits - Start>()
 			.template trunc<bits>();
 	}
 
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<T, Stop, Start> &operator=(const value<bits> &rhs) {
 		// Generic partial assignment implemented using a read-modify-write operation on the sliced expression.
 		expr = static_cast<const value<T::bits> &>(expr)
@@ -453,6 +503,7 @@ struct slice_expr : public expr_base<slice_expr<T, Stop, Start>> {
 	}
 
 	// A helper that forces the cast to value<>, which allows deduction to work.
+	CXXRTL_ALWAYS_INLINE
 	value<bits> val() const {
 		return static_cast<const value<bits> &>(*this);
 	}
@@ -469,6 +520,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 	concat_expr(T &ms_expr, U &ls_expr) : ms_expr(ms_expr), ls_expr(ls_expr) {}
 	concat_expr(const concat_expr<T, U> &) = delete;
 
+	CXXRTL_ALWAYS_INLINE
 	operator value<bits>() const {
 		value<bits> ms_shifted = static_cast<const value<T::bits> &>(ms_expr)
 			.template rzext<bits>();
@@ -477,6 +529,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 		return ms_shifted.bit_or(ls_extended);
 	}
 
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<T, U> &operator=(const value<bits> &rhs) {
 		ms_expr = rhs.template rtrunc<T::bits>();
 		ls_expr = rhs.template trunc<U::bits>();
@@ -484,6 +537,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 	}
 
 	// A helper that forces the cast to value<>, which allows deduction to work.
+	CXXRTL_ALWAYS_INLINE
 	value<bits> val() const {
 		return static_cast<const value<bits> &>(*this);
 	}
@@ -508,21 +562,25 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 template<class T>
 struct expr_base {
 	template<size_t Stop, size_t Start = Stop>
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<const T, Stop, Start> slice() const {
 		return {*static_cast<const T *>(this)};
 	}
 
 	template<size_t Stop, size_t Start = Stop>
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<T, Stop, Start> slice() {
 		return {*static_cast<T *>(this)};
 	}
 
 	template<class U>
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<const T, typename std::remove_reference<const U>::type> concat(const U &other) const {
 		return {*static_cast<const T *>(this), other};
 	}
 
 	template<class U>
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<T, typename std::remove_reference<U>::type> concat(U &&other) {
 		return {*static_cast<T *>(this), other};
 	}
@@ -734,76 +792,119 @@ struct debug_item : ::cxxrtl_object {
 	debug_item(const ::cxxrtl_object &object) : cxxrtl_object(object) {}
 
 	template<size_t Bits>
-	debug_item(value<Bits> &item) {
+	debug_item(value<Bits> &item, size_t lsb_offset = 0) { // describes a mutable value<Bits>
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
-		type  = VALUE;
-		width = Bits;
-		depth = 1;
-		curr  = item.data;
-		next  = item.data;
+		type    = VALUE;
+		width   = Bits;
+		lsb_at  = lsb_offset; // presumably the index of the item's least significant bit; confirm in cxxrtl_capi.h
+		depth   = 1;
+		zero_at = 0;
+		curr    = item.data;
+		next    = item.data; // curr and next refer to the same storage
 	}
 
 	template<size_t Bits>
-	debug_item(const value<Bits> &item) {
+	debug_item(const value<Bits> &item, size_t lsb_offset = 0) { // describes a const value<Bits>
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
-		type  = VALUE;
-		width = Bits;
-		depth = 1;
-		curr  = const_cast<chunk_t*>(item.data);
-		next  = nullptr;
+		type    = VALUE;
+		width   = Bits;
+		lsb_at  = lsb_offset; // presumably the index of the item's least significant bit; confirm in cxxrtl_capi.h
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.data); // constness of the storage pointer is cast away
+		next    = nullptr; // no next pointer is exposed for a const item
 	}
 
 	template<size_t Bits>
-	debug_item(wire<Bits> &item) {
+	debug_item(wire<Bits> &item, size_t lsb_offset = 0) { // describes a wire<Bits> with separate curr and next storage
 		static_assert(sizeof(item.curr) == value<Bits>::chunks * sizeof(chunk_t) &&
 		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
 		              "wire<Bits> is not compatible with C layout");
-		type  = WIRE;
-		width = Bits;
-		depth = 1;
-		curr  = item.curr.data;
-		next  = item.next.data;
+		type    = WIRE;
+		width   = Bits;
+		lsb_at  = lsb_offset; // presumably the index of the item's least significant bit; confirm in cxxrtl_capi.h
+		depth   = 1;
+		zero_at = 0;
+		curr    = item.curr.data;
+		next    = item.next.data;
 	}
 
 	template<size_t Width>
-	debug_item(memory<Width> &item) {
+	debug_item(memory<Width> &item, size_t zero_offset = 0) { // describes a memory<Width>; depth is its element count
 		static_assert(sizeof(item.data[0]) == value<Width>::chunks * sizeof(chunk_t),
 		              "memory<Width> is not compatible with C layout");
-		type  = MEMORY;
-		width = Width;
-		depth = item.data.size();
-		curr  = item.data.empty() ? nullptr : item.data[0].data;
-		next  = nullptr;
+		type    = MEMORY;
+		width   = Width;
+		lsb_at  = 0;
+		depth   = item.data.size();
+		zero_at = zero_offset; // presumably the index of the memory's first element; confirm in cxxrtl_capi.h
+		curr    = item.data.empty() ? nullptr : item.data[0].data; // an empty memory has no storage to point at
+		next    = nullptr;
 	}
 
 	template<size_t Bits>
-	debug_item(debug_alias, const value<Bits> &item) {
+	debug_item(debug_alias, const value<Bits> &item, size_t lsb_offset = 0) { // the debug_alias tag selects type ALIAS
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
-		type  = ALIAS;
-		width = Bits;
-		depth = 1;
-		curr  = const_cast<chunk_t*>(item.data);
-		next  = nullptr;
+		type    = ALIAS;
+		width   = Bits;
+		lsb_at  = lsb_offset; // presumably the index of the item's least significant bit; confirm in cxxrtl_capi.h
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.data); // constness of the storage pointer is cast away
+		next    = nullptr; // no next pointer is exposed for an alias
 	}
 
 	template<size_t Bits>
-	debug_item(debug_alias, const wire<Bits> &item) {
+	debug_item(debug_alias, const wire<Bits> &item, size_t lsb_offset = 0) { // the debug_alias tag selects type ALIAS
 		static_assert(sizeof(item.curr) == value<Bits>::chunks * sizeof(chunk_t) &&
 		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
 		              "wire<Bits> is not compatible with C layout");
-		type  = ALIAS;
-		width = Bits;
-		depth = 1;
-		curr  = const_cast<chunk_t*>(item.curr.data);
-		next  = nullptr;
+		type    = ALIAS;
+		width   = Bits;
+		lsb_at  = lsb_offset; // presumably the index of the item's least significant bit; confirm in cxxrtl_capi.h
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.curr.data); // only the wire's curr storage is exposed, with constness cast away
+		next    = nullptr; // no next pointer is exposed for an alias
 	}
 };
 static_assert(std::is_standard_layout<debug_item>::value, "debug_item is not compatible with C layout");
 
-typedef std::map<std::string, debug_item> debug_items;
+// Registry of debug items. A name may map to several parts, which are kept ordered by ascending lsb_at.
+struct debug_items {
+	std::map<std::string, std::vector<debug_item>> table;
+
+	// Registers `item` as one more part of `name`.
+	void add(const std::string &name, debug_item &&item) {
+		std::vector<debug_item> &parts = table[name];
+		// Insert at the sorted position (after parts with an equal lsb_at) rather than re-sorting on every call.
+		auto by_lsb = [](const debug_item &a, const debug_item &b) { return a.lsb_at < b.lsb_at; };
+		parts.insert(std::upper_bound(parts.begin(), parts.end(), item, by_lsb), item);
+	}
+
+	// Number of parts registered under `name`; 0 if the name is unknown.
+	size_t count(const std::string &name) const {
+		auto it = table.find(name);
+		return it == table.end() ? 0 : it->second.size();
+	}
+
+	// All parts of `name` in ascending lsb_at order; table.at() throws std::out_of_range for an unknown name.
+	const std::vector<debug_item> &parts_at(const std::string &name) const {
+		return table.at(name);
+	}
+
+	// The single part of `name`; asserts that exactly one part is registered under it.
+	const debug_item &at(const std::string &name) const {
+		const std::vector<debug_item> &parts = table.at(name);
+		assert(parts.size() == 1);
+		return parts.at(0);
+	}
+
+	const debug_item &operator [](const std::string &name) const { return at(name); } // same as at(name)
+};
 
 struct module {
 	module() {}
@@ -851,271 +952,322 @@ using namespace cxxrtl;
 
 // std::max isn't constexpr until C++14 for no particular reason (it's an oversight), so we define our own.
 template<class T>
+CXXRTL_ALWAYS_INLINE
 constexpr T max(const T &a, const T &b) {
 	return a > b ? a : b;
 }
 
 // Logic operations
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> logic_not(const value<BitsA> &a) {
 	return value<BitsY> { a ? 0u : 1u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> logic_and(const value<BitsA> &a, const value<BitsB> &b) {
 	return value<BitsY> { (bool(a) & bool(b)) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> logic_or(const value<BitsA> &a, const value<BitsB> &b) {
 	return value<BitsY> { (bool(a) | bool(b)) ? 1u : 0u };
 }
 
 // Reduction operations
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> reduce_and(const value<BitsA> &a) {
 	return value<BitsY> { a.bit_not().is_zero() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> reduce_or(const value<BitsA> &a) {
 	return value<BitsY> { a ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> reduce_xor(const value<BitsA> &a) {
 	return value<BitsY> { (a.ctpop() % 2) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> reduce_xnor(const value<BitsA> &a) {
 	return value<BitsY> { (a.ctpop() % 2) ? 0u : 1u };
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> reduce_bool(const value<BitsA> &a) {
 	return value<BitsY> { a ? 1u : 0u };
 }
 
 // Bitwise operations
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> not_u(const value<BitsA> &a) {
 	return a.template zcast<BitsY>().bit_not();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> not_s(const value<BitsA> &a) {
 	return a.template scast<BitsY>().bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> and_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_and(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> and_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_and(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> or_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_or(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> or_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_or(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xor_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xor_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xnor_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>()).bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xnor_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>()).bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template sshr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return shr_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return shr_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_us(const value<BitsA> &a, const value<BitsB> &b) {
 	return b.is_neg() ? shl_uu<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return b.is_neg() ? shl_su<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_us(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_us<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_ss<BitsY>(a, b);
 }
 
 // Comparison operations
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eq_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template zext<BitsExt>() == b.template zext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eq_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template sext<BitsExt>() == b.template sext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ne_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template zext<BitsExt>() != b.template zext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ne_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template sext<BitsExt>() != b.template sext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eqx_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return eq_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eqx_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return eq_ss<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> nex_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return ne_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> nex_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return ne_ss<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> gt_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> gt_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ge_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ge_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> lt_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> lt_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> le_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> le_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u };
@@ -1123,71 +1275,68 @@ value<BitsY> le_ss(const value<BitsA> &a, const value<BitsB> &b) {
 
 // Arithmetic operations
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> pos_u(const value<BitsA> &a) {
 	return a.template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> pos_s(const value<BitsA> &a) {
 	return a.template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> neg_u(const value<BitsA> &a) {
 	return a.template zcast<BitsY>().neg();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> neg_s(const value<BitsA> &a) {
 	return a.template scast<BitsY>().neg();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> add_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().add(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> add_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().add(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sub_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().sub(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sub_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().sub(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsY> product;
-	value<BitsY> multiplicand = a.template zcast<BitsY>();
-	const value<BitsB> &multiplier = b;
-	uint32_t multiplicand_shift = 0;
-	for (size_t step = 0; step < BitsB; step++) {
-		if (multiplier.bit(step)) {
-			multiplicand = multiplicand.shl(value<32> { multiplicand_shift });
-			product = product.add(multiplicand);
-			multiplicand_shift = 0;
-		}
-		multiplicand_shift++;
-	}
-	return product;
+	constexpr size_t BitsM = BitsA >= BitsB ? BitsA : BitsB; // common operand width: the wider of A and B
+	return a.template zcast<BitsM>().template mul<BitsY>(b.template zcast<BitsM>()); // both operands are brought to BitsM via zcast, then multiplied into BitsY bits
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_ss(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
-	if (ub.is_neg()) ub = ub.neg();
-	value<BitsY> y = mul_uu<BitsY>(a.template scast<BitsY>(), ub);
-	return b.is_neg() ? y.neg() : y;
+	return a.template scast<BitsY>().template mul<BitsY>(b.template scast<BitsY>()); // scast sign-extends or truncates each operand to BitsY before the multiply
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 std::pair<value<BitsY>, value<BitsY>> divmod_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t Bits = max(BitsY, max(BitsA, BitsB));
 	value<Bits> quotient;
@@ -1209,6 +1358,7 @@ std::pair<value<BitsY>, value<BitsY>> divmod_uu(const value<BitsA> &a, const val
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 std::pair<value<BitsY>, value<BitsY>> divmod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	value<BitsA + 1> ua = a.template sext<BitsA + 1>();
 	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
@@ -1222,21 +1372,25 @@ std::pair<value<BitsY>, value<BitsY>> divmod_ss(const value<BitsA> &a, const val
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> div_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_uu<BitsY>(a, b).first;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> div_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_ss<BitsY>(a, b).first;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mod_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_uu<BitsY>(a, b).second;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_ss<BitsY>(a, b).second;
 }
diff --git a/backends/cxxrtl/cxxrtl_backend.cc b/backends/cxxrtl/cxxrtl_backend.cc
index 052053c52..72190d99a 100644
--- a/backends/cxxrtl/cxxrtl_backend.cc
+++ b/backends/cxxrtl/cxxrtl_backend.cc
@@ -171,11 +171,6 @@ struct Scheduler {
 	}
 };
 
-bool is_input_wire(const RTLIL::Wire *wire)
-{
-	return wire->port_input && !wire->port_output;
-}
-
 bool is_unary_cell(RTLIL::IdString type)
 {
 	return type.in(
@@ -202,7 +197,7 @@ bool is_extending_cell(RTLIL::IdString type)
 bool is_elidable_cell(RTLIL::IdString type)
 {
 	return is_unary_cell(type) || is_binary_cell(type) || type.in(
-		ID($mux), ID($concat), ID($slice));
+		ID($mux), ID($concat), ID($slice), ID($pmux)); // $pmux is now treated as elidable as well
 }
 
 bool is_sync_ff_cell(RTLIL::IdString type)
@@ -804,7 +799,7 @@ struct CxxrtlWorker {
 					default:
 						log_assert(false);
 				}
-			} else if (unbuffered_wires[chunk.wire] || is_input_wire(chunk.wire)) {
+			} else if (unbuffered_wires[chunk.wire]) {
 				f << mangle(chunk.wire);
 			} else {
 				f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
@@ -947,6 +942,21 @@ struct CxxrtlWorker {
 			f << " : ";
 			dump_sigspec_rhs(cell->getPort(ID::A));
 			f << ")";
+		// Parallel (one-hot) muxes
+		} else if (cell->type == ID($pmux)) {
+			int width = cell->getParam(ID::WIDTH).as_int();
+			int s_width = cell->getParam(ID::S_WIDTH).as_int();
+			for (int part = 0; part < s_width; part++) { // opens one "(S[part] ? B[part] : " level per select bit
+				f << "(";
+				dump_sigspec_rhs(cell->getPort(ID::S).extract(part));
+				f << " ? ";
+				dump_sigspec_rhs(cell->getPort(ID::B).extract(part * width, width)); // the WIDTH-wide slice of B for this select bit
+				f << " : ";
+			}
+			dump_sigspec_rhs(cell->getPort(ID::A)); // innermost alternative: A is the result when no select bit is set
+			for (int part = 0; part < s_width; part++) { // closes the parentheses opened by the first loop
+				f << ")";
+			}
 		// Concats
 		} else if (cell->type == ID($concat)) {
 			dump_sigspec_rhs(cell->getPort(ID::B));
@@ -1013,35 +1023,6 @@ struct CxxrtlWorker {
 			f << " = ";
 			dump_cell_elided(cell);
 			f << ";\n";
-		// Parallel (one-hot) muxes
-		} else if (cell->type == ID($pmux)) {
-			int width = cell->getParam(ID::WIDTH).as_int();
-			int s_width = cell->getParam(ID::S_WIDTH).as_int();
-			bool first = true;
-			for (int part = 0; part < s_width; part++) {
-				f << (first ? indent : " else ");
-				first = false;
-				f << "if (";
-				dump_sigspec_rhs(cell->getPort(ID::S).extract(part));
-				f << ") {\n";
-				inc_indent();
-					f << indent;
-					dump_sigspec_lhs(cell->getPort(ID::Y));
-					f << " = ";
-					dump_sigspec_rhs(cell->getPort(ID::B).extract(part * width, width));
-					f << ";\n";
-				dec_indent();
-				f << indent << "}";
-			}
-			f << " else {\n";
-			inc_indent();
-				f << indent;
-				dump_sigspec_lhs(cell->getPort(ID::Y));
-				f << " = ";
-				dump_sigspec_rhs(cell->getPort(ID::A));
-				f << ";\n";
-			dec_indent();
-			f << indent << "}\n";
 		// Flip-flops
 		} else if (is_ff_cell(cell->type)) {
 			if (cell->hasPort(ID::CLK) && cell->getPort(ID::CLK).is_wire()) {
@@ -1153,31 +1134,33 @@ struct CxxrtlWorker {
 				f << indent << "if(" << valid_index_temp << ".valid) {\n";
 				inc_indent();
 					if (writable_memories[memory]) {
-						std::string addr_temp = fresh_temporary();
-						f << indent << "const value<" << cell->getPort(ID::ADDR).size() << "> &" << addr_temp << " = ";
-						dump_sigspec_rhs(cell->getPort(ID::ADDR));
-						f << ";\n";
 						std::string lhs_temp = fresh_temporary();
 						f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
 						            << mangle(memory) << "[" << valid_index_temp << ".index];\n";
 						std::vector<const RTLIL::Cell*> memwr_cells(transparent_for[cell].begin(), transparent_for[cell].end());
-						std::sort(memwr_cells.begin(), memwr_cells.end(),
-							[](const RTLIL::Cell *a, const RTLIL::Cell *b) {
-								return a->getParam(ID::PRIORITY).as_int() < b->getParam(ID::PRIORITY).as_int();
-							});
-						for (auto memwr_cell : memwr_cells) {
-							f << indent << "if (" << addr_temp << " == ";
-							dump_sigspec_rhs(memwr_cell->getPort(ID::ADDR));
-							f << ") {\n";
-							inc_indent();
-								f << indent << lhs_temp << " = " << lhs_temp;
-								f << ".update(";
-								dump_sigspec_rhs(memwr_cell->getPort(ID::DATA));
-								f << ", ";
-								dump_sigspec_rhs(memwr_cell->getPort(ID::EN));
-								f << ");\n";
-							dec_indent();
-							f << indent << "}\n";
+						if (!memwr_cells.empty()) {
+							std::string addr_temp = fresh_temporary();
+							f << indent << "const value<" << cell->getPort(ID::ADDR).size() << "> &" << addr_temp << " = ";
+							dump_sigspec_rhs(cell->getPort(ID::ADDR));
+							f << ";\n";
+							std::sort(memwr_cells.begin(), memwr_cells.end(),
+								[](const RTLIL::Cell *a, const RTLIL::Cell *b) {
+									return a->getParam(ID::PRIORITY).as_int() < b->getParam(ID::PRIORITY).as_int();
+								});
+							for (auto memwr_cell : memwr_cells) {
+								f << indent << "if (" << addr_temp << " == ";
+								dump_sigspec_rhs(memwr_cell->getPort(ID::ADDR));
+								f << ") {\n";
+								inc_indent();
+									f << indent << lhs_temp << " = " << lhs_temp;
+									f << ".update(";
+									dump_sigspec_rhs(memwr_cell->getPort(ID::DATA));
+									f << ", ";
+									dump_sigspec_rhs(memwr_cell->getPort(ID::EN));
+									f << ");\n";
+								dec_indent();
+								f << indent << "}\n";
+							}
 						}
 						f << indent;
 						dump_sigspec_lhs(cell->getPort(ID::DATA));
@@ -1440,12 +1423,11 @@ struct CxxrtlWorker {
 		if (elided_wires.count(wire))
 			return;
 
-		if (unbuffered_wires[wire]) {
-			if (localized_wires[wire] == is_local_context) {
-				dump_attrs(wire);
-				f << indent << "value<" << wire->width << "> " << mangle(wire) << ";\n";
-			}
-		} else if (!is_local_context) {
+		if (localized_wires[wire] && is_local_context) {
+			dump_attrs(wire);
+			f << indent << "value<" << wire->width << "> " << mangle(wire) << ";\n";
+		}
+		if (!localized_wires[wire] && !is_local_context) {
 			std::string width;
 			if (wire->module->has_attribute(ID(cxxrtl_blackbox)) && wire->has_attribute(ID(cxxrtl_width))) {
 				width = wire->get_string_attribute(ID(cxxrtl_width));
@@ -1454,14 +1436,21 @@ struct CxxrtlWorker {
 			}
 
 			dump_attrs(wire);
-			f << indent << (is_input_wire(wire) ? "value" : "wire") << "<" << width << "> " << mangle(wire);
+			f << indent;
+			if (wire->port_input && wire->port_output)
+				f << "/*inout*/ ";
+			else if (wire->port_input)
+				f << "/*input*/ ";
+			else if (wire->port_output)
+				f << "/*output*/ ";
+			f << (unbuffered_wires[wire] ? "value" : "wire") << "<" << width << "> " << mangle(wire);
 			if (wire->has_attribute(ID::init)) {
 				f << " ";
 				dump_const_init(wire->attributes.at(ID::init));
 			}
 			f << ";\n";
 			if (edge_wires[wire]) {
-				if (is_input_wire(wire)) {
+				if (unbuffered_wires[wire]) {
 					f << indent << "value<" << width << "> prev_" << mangle(wire);
 					if (wire->has_attribute(ID::init)) {
 						f << " ";
@@ -1472,7 +1461,7 @@ struct CxxrtlWorker {
 				for (auto edge_type : edge_types) {
 					if (edge_type.first.wire == wire) {
 						std::string prev, next;
-						if (is_input_wire(wire)) {
+						if (unbuffered_wires[wire]) {
 							prev = "prev_" + mangle(edge_type.first.wire);
 							next =           mangle(edge_type.first.wire);
 						} else {
@@ -1595,9 +1584,9 @@ struct CxxrtlWorker {
 		inc_indent();
 			f << indent << "bool changed = false;\n";
 			for (auto wire : module->wires()) {
-				if (elided_wires.count(wire) || unbuffered_wires.count(wire))
+				if (elided_wires.count(wire))
 					continue;
-				if (is_input_wire(wire)) {
+				if (unbuffered_wires[wire]) {
 					if (edge_wires[wire])
 						f << indent << "prev_" << mangle(wire) << " = " << mangle(wire) << ";\n";
 					continue;
@@ -1634,41 +1623,49 @@ struct CxxrtlWorker {
 			for (auto wire : module->wires()) {
 				if (wire->name[0] != '\\')
 					continue;
+				if (module->get_bool_attribute(ID(cxxrtl_blackbox)) && (wire->port_id == 0))
+					continue;
 				count_public_wires++;
 				if (debug_const_wires.count(wire)) {
 					// Wire tied to a constant
 					f << indent << "static const value<" << wire->width << "> const_" << mangle(wire) << " = ";
 					dump_const(debug_const_wires[wire]);
 					f << ";\n";
-					f << indent << "items.emplace(path + " << escape_cxx_string(get_hdl_name(wire));
-					f << ", debug_item(const_" << mangle(wire) << "));\n";
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(const_" << mangle(wire) << ", ";
+					f << wire->start_offset << "));\n";
 					count_const_wires++;
 				} else if (debug_alias_wires.count(wire)) {
 					// Alias of a member wire
-					f << indent << "items.emplace(path + " << escape_cxx_string(get_hdl_name(wire));
-					f << ", debug_item(debug_alias(), " << mangle(debug_alias_wires[wire]) << "));\n";
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(debug_alias(), " << mangle(debug_alias_wires[wire]) << ", ";
+					f << wire->start_offset << "));\n";
 					count_alias_wires++;
 				} else if (!localized_wires.count(wire)) {
 					// Member wire
-					f << indent << "items.emplace(path + " << escape_cxx_string(get_hdl_name(wire));
-					f << ", debug_item(" << mangle(wire) << "));\n";
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(" << mangle(wire) << ", ";
+					f << wire->start_offset << "));\n";
 					count_member_wires++;
 				} else {
 					count_skipped_wires++;
 				}
 			}
-			for (auto &memory_it : module->memories) {
-				if (memory_it.first[0] != '\\')
-					continue;
-				f << indent << "items.emplace(path + " << escape_cxx_string(get_hdl_name(memory_it.second));
-				f << ", debug_item(" << mangle(memory_it.second) << "));\n";
-			}
-			for (auto cell : module->cells()) {
-				if (is_internal_cell(cell->type))
-					continue;
-				const char *access = is_cxxrtl_blackbox_cell(cell) ? "->" : ".";
-				f << indent << mangle(cell) << access << "debug_info(items, ";
-				f << "path + " << escape_cxx_string(get_hdl_name(cell) + ' ') << ");\n";
+			if (!module->get_bool_attribute(ID(cxxrtl_blackbox))) {
+				for (auto &memory_it : module->memories) {
+					if (memory_it.first[0] != '\\')
+						continue;
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(memory_it.second));
+					f << ", debug_item(" << mangle(memory_it.second) << ", ";
+					f << memory_it.second->start_offset << "));\n";
+				}
+				for (auto cell : module->cells()) {
+					if (is_internal_cell(cell->type))
+						continue;
+					const char *access = is_cxxrtl_blackbox_cell(cell) ? "->" : ".";
+					f << indent << mangle(cell) << access << "debug_info(items, ";
+					f << "path + " << escape_cxx_string(get_hdl_name(cell) + ' ') << ");\n";
+				}
 			}
 		dec_indent();
 
@@ -1970,6 +1967,8 @@ struct CxxrtlWorker {
 			if (module->get_bool_attribute(ID(cxxrtl_blackbox))) {
 				for (auto port : module->ports) {
 					RTLIL::Wire *wire = module->wire(port);
+					if (wire->port_input && !wire->port_output)
+						unbuffered_wires.insert(wire);
 					if (wire->has_attribute(ID(cxxrtl_edge))) {
 						RTLIL::Const edge_attr = wire->attributes[ID(cxxrtl_edge)];
 						if (!(edge_attr.flags & RTLIL::CONST_FLAG_STRING) || (int)edge_attr.decode_string().size() != GetSize(wire))
@@ -2158,13 +2157,14 @@ struct CxxrtlWorker {
 
 			for (auto wire : module->wires()) {
 				if (feedback_wires[wire]) continue;
-				if (wire->port_id != 0) continue;
-				if (wire->get_bool_attribute(ID::keep)) continue;
+				if (wire->port_output && !module->get_bool_attribute(ID::top)) continue;
 				if (wire->name.begins_with("$") && !unbuffer_internal) continue;
 				if (wire->name.begins_with("\\") && !unbuffer_public) continue;
-				if (edge_wires[wire]) continue;
 				if (flow.wire_sync_defs.count(wire) > 0) continue;
 				unbuffered_wires.insert(wire);
+				if (edge_wires[wire]) continue;
+				if (wire->get_bool_attribute(ID::keep)) continue;
+				if (wire->port_input || wire->port_output) continue;
 				if (wire->name.begins_with("$") && !localize_internal) continue;
 				if (wire->name.begins_with("\\") && !localize_public) continue;
 				localized_wires.insert(wire);
diff --git a/backends/cxxrtl/cxxrtl_capi.cc b/backends/cxxrtl/cxxrtl_capi.cc
index 489d72da5..e0566e152 100644
--- a/backends/cxxrtl/cxxrtl_capi.cc
+++ b/backends/cxxrtl/cxxrtl_capi.cc
@@ -47,14 +47,17 @@ size_t cxxrtl_step(cxxrtl_handle handle) {
 	return handle->module->step();
 }
 
-cxxrtl_object *cxxrtl_get(cxxrtl_handle handle, const char *name) {
-	if (handle->objects.count(name) > 0)
-		return static_cast<cxxrtl_object*>(&handle->objects.at(name));
-	return nullptr;
+struct cxxrtl_object *cxxrtl_get_parts(cxxrtl_handle handle, const char *name, size_t *parts) {
+	auto it = handle->objects.table.find(name);
+	if (it == handle->objects.table.end())
+		return nullptr;
+	*parts = it->second.size();
+	return static_cast<cxxrtl_object*>(&it->second[0]);
 }
 
 void cxxrtl_enum(cxxrtl_handle handle, void *data,
-                 void (*callback)(void *data, const char *name, cxxrtl_object *object)) {
-	for (auto &it : handle->objects)
-		callback(data, it.first.c_str(), static_cast<cxxrtl_object*>(&it.second));
+                 void (*callback)(void *data, const char *name,
+                                  cxxrtl_object *object, size_t parts)) {
+	for (auto &it : handle->objects.table)
+		callback(data, it.first.c_str(), static_cast<cxxrtl_object*>(&it.second[0]), it.second.size());
 }
diff --git a/backends/cxxrtl/cxxrtl_capi.h b/backends/cxxrtl/cxxrtl_capi.h
index 8bd906ea4..599284898 100644
--- a/backends/cxxrtl/cxxrtl_capi.h
+++ b/backends/cxxrtl/cxxrtl_capi.h
@@ -26,6 +26,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <assert.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -113,9 +114,15 @@ struct cxxrtl_object {
 	// Width of the object in bits.
 	size_t width;
 
+	// Index of the least significant bit.
+	size_t lsb_at;
+
 	// Depth of the object. Only meaningful for memories; for other objects, always 1.
 	size_t depth;
 
+	// Index of the first word. Only meaningful for memories; for other objects, always 0.
+	size_t zero_at;
+
 	// Bits stored in the object, as 32-bit chunks, least significant bits first.
 	//
 	// The width is rounded up to a multiple of 32; the padding bits are always set to 0 by
@@ -140,17 +147,36 @@ struct cxxrtl_object {
 // the top-level module instantiates a module `foo`, which in turn contains a wire `bar`, the full
 // hierarchical name is `\foo \bar`.
 //
-// Returns the object if it was found, NULL otherwise. The returned value is valid until the design
-// is destroyed.
-struct cxxrtl_object *cxxrtl_get(cxxrtl_handle handle, const char *name);
+// The storage of a single abstract object may be split (usually with the `splitnets` pass) into
+// many physical parts, all of which correspond to the same hierarchical name. To handle such cases,
+// this function returns an array and writes its length to `parts`. The array is sorted by `lsb_at`.
+//
+// Returns the object parts if it was found, NULL otherwise. The returned parts are valid until
+// the design is destroyed.
+struct cxxrtl_object *cxxrtl_get_parts(cxxrtl_handle handle, const char *name, size_t *parts);
+
+// Retrieve description of a single-part simulated object.
+//
+// Shortcut for the most common use of `cxxrtl_get_parts`. Asserts that the object, if it exists,
+// has exactly one part; if assertions are disabled, returns NULL for multi-part objects. Declared
+// `static` so that C translation units including this header need no external definition of it.
+static inline struct cxxrtl_object *cxxrtl_get(cxxrtl_handle handle, const char *name) {
+	size_t parts = 0;
+	struct cxxrtl_object *object = cxxrtl_get_parts(handle, name, &parts);
+	assert(object == NULL || parts == 1);
+	if (object == NULL || parts == 1)
+		return object;
+	return NULL;
+}
 
 // Enumerate simulated objects.
 //
 // For every object in the simulation, `callback` is called with the provided `data`, the full
-// hierarchical name of the object (see `cxxrtl_get` for details), and the object description.
+// hierarchical name of the object (see `cxxrtl_get` for details), and the object parts.
 // The provided `name` and `object` values are valid until the design is destroyed.
 void cxxrtl_enum(cxxrtl_handle handle, void *data,
-                 void (*callback)(void *data, const char *name, struct cxxrtl_object *object));
+                 void (*callback)(void *data, const char *name,
+                                  struct cxxrtl_object *object, size_t parts));
 
 #ifdef __cplusplus
 }
diff --git a/backends/cxxrtl/cxxrtl_vcd.h b/backends/cxxrtl/cxxrtl_vcd.h
index 4c2021e92..dbeabbaf2 100644
--- a/backends/cxxrtl/cxxrtl_vcd.h
+++ b/backends/cxxrtl/cxxrtl_vcd.h
@@ -66,11 +66,19 @@ class vcd_writer {
 		} while (ident != 0);
 	}
 
-	void emit_var(const variable &var, const std::string &type, const std::string &name) {
+	void emit_var(const variable &var, const std::string &type, const std::string &name,
+	              size_t lsb_at, bool multipart) {
 		assert(!streaming);
 		buffer += "$var " + type + " " + std::to_string(var.width) + " ";
 		emit_ident(var.ident);
-		buffer += " " + name + " $end\n";
+		buffer += " " + name;
+		if (multipart || name.back() == ']' || lsb_at != 0) {
+			if (var.width == 1)
+				buffer += " [" + std::to_string(lsb_at) + "]";
+			else
+				buffer += " [" + std::to_string(lsb_at + var.width - 1) + ":" + std::to_string(lsb_at) + "]";
+		}
+		buffer += " $end\n";
 	}
 
 	void emit_enddefinitions() {
@@ -155,7 +163,7 @@ public:
 		emit_timescale(number, unit);
 	}
 
-	void add(const std::string &hier_name, const debug_item &item) {
+	void add(const std::string &hier_name, const debug_item &item, bool multipart = false) {
 		std::vector<std::string> scope = split_hierarchy(hier_name);
 		std::string name = scope.back();
 		scope.pop_back();
@@ -164,17 +172,20 @@ public:
 		switch (item.type) {
 			// Not the best naming but oh well...
 			case debug_item::VALUE:
-				emit_var(register_variable(item.width, item.curr, /*constant=*/item.next == nullptr), "wire", name);
+				emit_var(register_variable(item.width, item.curr, /*constant=*/item.next == nullptr),
+				         "wire", name, item.lsb_at, multipart);
 				break;
 			case debug_item::WIRE:
-				emit_var(register_variable(item.width, item.curr), "reg", name);
+				emit_var(register_variable(item.width, item.curr),
+				         "reg", name, item.lsb_at, multipart);
 				break;
 			case debug_item::MEMORY: {
 				const size_t stride = (item.width + (sizeof(chunk_t) * 8 - 1)) / (sizeof(chunk_t) * 8);
 				for (size_t index = 0; index < item.depth; index++) {
 					chunk_t *nth_curr = &item.curr[stride * index];
 					std::string nth_name = name + '[' + std::to_string(index) + ']';
-					emit_var(register_variable(item.width, nth_curr), "reg", nth_name);
+					emit_var(register_variable(item.width, nth_curr),
+					         "reg", nth_name, item.lsb_at, multipart);
 				}
 				break;
 			}
@@ -183,7 +194,8 @@ public:
 				// can actually change, and must be tracked. In most cases the VCD identifier will be
 				// unified with the aliased reg, but we should handle the case where only the alias is
 				// added to the VCD writer, too.
-				emit_var(register_variable(item.width, item.curr), "wire", name);
+				emit_var(register_variable(item.width, item.curr),
+				         "wire", name, item.lsb_at, multipart);
 				break;
 		}
 	}
@@ -192,9 +204,10 @@ public:
 	void add(const debug_items &items, const Filter &filter) {
 		// `debug_items` is a map, so the items are already sorted in an order optimal for emitting
 		// VCD scope sections.
-		for (auto &it : items)
-			if (filter(it.first, it.second))
-				add(it.first, it.second);
+		for (auto &it : items.table)
+			for (auto &part : it.second)
+				if (filter(it.first, part))
+					add(it.first, part, it.second.size() > 1);
 	}
 
 	void add(const debug_items &items) {
diff --git a/frontends/verilog/preproc.cc b/frontends/verilog/preproc.cc
index 7905ea598..ea23139e2 100644
--- a/frontends/verilog/preproc.cc
+++ b/frontends/verilog/preproc.cc
@@ -591,7 +591,7 @@ read_define_args()
 
 		default:
 			// The only FSM states are 0-2 and we dealt with 2 at the start of the loop.
-			__builtin_unreachable();
+			log_assert(false);
 		}
 	}
 
diff --git a/frontends/verilog/verilog_parser.y b/frontends/verilog/verilog_parser.y
index b34a62248..15c231f3b 100644
--- a/frontends/verilog/verilog_parser.y
+++ b/frontends/verilog/verilog_parser.y
@@ -1481,7 +1481,7 @@ enum_name_decl:
 		delete $1;
 		SET_AST_NODE_LOC(node, @1, @1);
 		delete node->children[0];
-		node->children[0] = $2 ?: new AstNode(AST_NONE);
+		node->children[0] = $2 ? $2 : new AstNode(AST_NONE);
 		astbuf2->children.push_back(node);
 	}
 	;
diff --git a/kernel/log.h b/kernel/log.h
index 516744b50..63590c489 100644
--- a/kernel/log.h
+++ b/kernel/log.h
@@ -55,7 +55,9 @@
 #else
 #  include <sys/time.h>
 #  include <sys/resource.h>
-#  include <signal.h>
+#  if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+#    include <signal.h>
+#  endif
 #endif
 
 #if defined(_MSC_VER)
diff --git a/kernel/yosys.h b/kernel/yosys.h
index c922faf26..4fca39228 100644
--- a/kernel/yosys.h
+++ b/kernel/yosys.h
@@ -117,11 +117,11 @@ extern Tcl_Obj *Tcl_ObjSetVar2(Tcl_Interp *interp, Tcl_Obj *part1Ptr, Tcl_Obj *p
 #    define PATH_MAX MAX_PATH
 #    define isatty _isatty
 #    define fileno _fileno
-#  else
-//   mingw includes `wingdi.h` which defines a TRANSPARENT macro
-//   that conflicts with X(TRANSPARENT) entry in kernel/constids.inc
-#    undef TRANSPARENT
 #  endif
+
+// mingw and msvc include `wingdi.h` which defines a TRANSPARENT macro
+// that conflicts with X(TRANSPARENT) entry in kernel/constids.inc
+#  undef TRANSPARENT
 #endif
 
 #ifndef PATH_MAX
diff --git a/passes/techmap/extract_counter.cc b/passes/techmap/extract_counter.cc
index 68b338143..77a4bc0b6 100644
--- a/passes/techmap/extract_counter.cc
+++ b/passes/techmap/extract_counter.cc
@@ -795,11 +795,11 @@ struct ExtractCounterPass : public Pass {
 		pool<RTLIL::IdString> _parallel_cells;
 		CounterExtractionSettings settings
 		{
-			.parallel_cells = _parallel_cells,
-			.maxwidth = 64,
-			.minwidth = 2,
-			.allow_arst = true,
-			.allowed_dirs = 0,
+			_parallel_cells,    // parallel_cells
+			64,                 // maxwidth
+			2,                  // minwidth
+			true,               // allow_arst
+			0,                  // allowed_dirs
 		};
 
 		size_t argidx;
diff --git a/techlibs/ice40/cells_sim.v b/techlibs/ice40/cells_sim.v
index ad572c877..7ee809262 100644
--- a/techlibs/ice40/cells_sim.v
+++ b/techlibs/ice40/cells_sim.v
@@ -2508,7 +2508,7 @@ module SB_SPRAM256KA (
 
 	always @(negedge POWEROFF) begin
 		for (i = 0; i <= 16383; i = i+1)
-			mem[i] = 'bx;
+			mem[i] = 16'bx;
 	end
 
 	always @(posedge CLOCK, posedge off) begin
@@ -2516,17 +2516,17 @@ module SB_SPRAM256KA (
 			DATAOUT <= 0;
 		end else
 		if (STANDBY) begin
-			DATAOUT <= 'bx;
+			DATAOUT <= 16'bx;
 		end else
 		if (CHIPSELECT) begin
 			if (!WREN) begin
 				DATAOUT <= mem[ADDRESS];
 			end else begin
-				if (MASKWREN[0]) mem[ADDRESS][ 3: 0] = DATAIN[ 3: 0];
-				if (MASKWREN[1]) mem[ADDRESS][ 7: 4] = DATAIN[ 7: 4];
-				if (MASKWREN[2]) mem[ADDRESS][11: 8] = DATAIN[11: 8];
-				if (MASKWREN[3]) mem[ADDRESS][15:12] = DATAIN[15:12];
-				DATAOUT <= 'bx;
+				if (MASKWREN[0]) mem[ADDRESS][ 3: 0] <= DATAIN[ 3: 0];
+				if (MASKWREN[1]) mem[ADDRESS][ 7: 4] <= DATAIN[ 7: 4];
+				if (MASKWREN[2]) mem[ADDRESS][11: 8] <= DATAIN[11: 8];
+				if (MASKWREN[3]) mem[ADDRESS][15:12] <= DATAIN[15:12];
+				DATAOUT <= 16'bx;
 			end
 		end
 	end
diff --git a/techlibs/intel_alm/synth_intel_alm.cc b/techlibs/intel_alm/synth_intel_alm.cc
index 0f844961e..6508affc0 100644
--- a/techlibs/intel_alm/synth_intel_alm.cc
+++ b/techlibs/intel_alm/synth_intel_alm.cc
@@ -199,7 +199,7 @@ struct SynthIntelALMPass : public ScriptPass {
 		}
 
 		if (check_label("map_ffs")) {
-			run("dff2dffe -direct-match $_DFF_*");
+			run("dff2dffe");
 			// As mentioned in common/dff_sim.v, Intel flops power up to zero,
 			// so use `zinit` to add inverters where needed.
 			run("zinit");
diff --git a/tests/arch/intel_alm/dffs.ys b/tests/arch/intel_alm/dffs.ys
index cf29ad8e0..149b3121a 100644
--- a/tests/arch/intel_alm/dffs.ys
+++ b/tests/arch/intel_alm/dffs.ys
@@ -17,6 +17,5 @@ equiv_opt -async2sync -assert -map +/intel_alm/common/alm_sim.v -map +/intel_alm
 design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design)
 cd dffe # Constrain all select calls below inside the top module
 select -assert-count 1 t:MISTRAL_FF
-select -assert-count 1 t:MISTRAL_ALUT3
 
-select -assert-none t:MISTRAL_FF t:MISTRAL_ALUT3 %% t:* %D
+select -assert-none t:MISTRAL_FF %% t:* %D
diff --git a/tests/arch/intel_alm/fsm.ys b/tests/arch/intel_alm/fsm.ys
index 8bb0ebab2..67965569b 100644
--- a/tests/arch/intel_alm/fsm.ys
+++ b/tests/arch/intel_alm/fsm.ys
@@ -13,6 +13,7 @@ cd fsm # Constrain all select calls below inside the top module
 
 select -assert-count 6 t:MISTRAL_FF
 select -assert-max 2 t:MISTRAL_ALUT2 # Clang returns 2, GCC returns 1
+select -assert-count 1 t:MISTRAL_ALUT3
 select -assert-count 5 t:MISTRAL_ALUT5
-select -assert-count 1 t:MISTRAL_ALUT6
-select -assert-none t:MISTRAL_FF t:MISTRAL_ALUT2 t:MISTRAL_ALUT5 t:MISTRAL_ALUT6 %% t:* %D
+select -assert-count 2 t:MISTRAL_ALUT6
+select -assert-none t:MISTRAL_FF t:MISTRAL_ALUT2 t:MISTRAL_ALUT3 t:MISTRAL_ALUT5 t:MISTRAL_ALUT6 %% t:* %D