From ece25a45d4b12f0436be238a13e622b58282036e Mon Sep 17 00:00:00 2001 From: whitequark Date: Sun, 13 Dec 2020 07:03:16 +0000 Subject: cxxrtl: implement debug information outlining. Aggressive wire localization and inlining is necessary for CXXRTL to achieve high performance. However, that comes with a cost: reduced debug information coverage. Previously, as a workaround, the `-Og` option could have been used to guarantee complete coverage, at a cost of a significant performance penalty. This commit introduces debug information outlining. The main eval() function is compiled with the user-specified optimization settings. In tandem, an auxiliary debug_eval() function, compiled from the same netlist, can be used to reconstruct the values of localized/inlined signals on demand. To the extent that it is possible, debug_eval() reuses the results of computations performed by eval(), only filling in the missing values. Benchmarking a representative design (Minerva SoC SRAM) shows that: * Switching from `-O4`/`-Og` to `-O6` reduces runtime by ~40%. * Switching from `-g1` to `-g2`, both used with `-O6`, increases compile time by ~25%. * Although `-g2` increases the resident size of generated modules, this has no effect on runtime. Because the impact of `-g2` is minimal and the benefits of having unconditional 100% debug information coverage (and the performance improvement as well) are major, this commit removes `-Og` and changes the defaults to `-O6 -g2`. We'll have our cake and eat it too! --- backends/cxxrtl/cxxrtl.h | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'backends/cxxrtl/cxxrtl.h') diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h index d850fdba4..59393e415 100644 --- a/backends/cxxrtl/cxxrtl.h +++ b/backends/cxxrtl/cxxrtl.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -843,6 +844,9 @@ typedef std::map metadata_map; // Tag class to disambiguate values/wires and their aliases. struct debug_alias {}; +// Tag declaration to disambiguate values and debug outlines. +using debug_outline = ::_cxxrtl_outline; + // This structure is intended for consumption via foreign function interfaces, like Python's ctypes. // Because of this it uses a C-style layout that is easy to parse rather than more idiomatic C++. // @@ -851,10 +855,11 @@ struct debug_alias {}; struct debug_item : ::cxxrtl_object { // Object types. enum : uint32_t { - VALUE = CXXRTL_VALUE, - WIRE = CXXRTL_WIRE, - MEMORY = CXXRTL_MEMORY, - ALIAS = CXXRTL_ALIAS, + VALUE = CXXRTL_VALUE, + WIRE = CXXRTL_WIRE, + MEMORY = CXXRTL_MEMORY, + ALIAS = CXXRTL_ALIAS, + OUTLINE = CXXRTL_OUTLINE, }; // Object flags. @@ -881,6 +886,7 @@ struct debug_item : ::cxxrtl_object { zero_at = 0; curr = item.data; next = item.data; + outline = nullptr; } template @@ -895,6 +901,7 @@ struct debug_item : ::cxxrtl_object { zero_at = 0; curr = const_cast(item.data); next = nullptr; + outline = nullptr; } template @@ -910,6 +917,7 @@ struct debug_item : ::cxxrtl_object { zero_at = 0; curr = item.curr.data; next = item.next.data; + outline = nullptr; } template @@ -924,6 +932,7 @@ struct debug_item : ::cxxrtl_object { zero_at = zero_offset; curr = item.data.empty() ? nullptr : item.data[0].data; next = nullptr; + outline = nullptr; } template @@ -938,6 +947,7 @@ struct debug_item : ::cxxrtl_object { zero_at = 0; curr = const_cast(item.data); next = nullptr; + outline = nullptr; } template @@ -953,6 +963,22 @@ struct debug_item : ::cxxrtl_object { zero_at = 0; curr = const_cast(item.curr.data); next = nullptr; + outline = nullptr; + } + + template + debug_item(debug_outline &group, const value &item, size_t lsb_offset = 0) { + static_assert(sizeof(item) == value::chunks * sizeof(chunk_t), + "value is not compatible with C layout"); + type = OUTLINE; + flags = DRIVEN_COMB; + width = Bits; + lsb_at = lsb_offset; + depth = 1; + zero_at = 0; + curr = const_cast(item.data); + next = nullptr; + outline = &group; } }; static_assert(std::is_standard_layout::value, "debug_item is not compatible with C layout"); @@ -1029,11 +1055,16 @@ struct module { } // namespace cxxrtl -// Internal structure used to communicate with the implementation of the C interface. +// Internal structures used to communicate with the implementation of the C interface. + typedef struct _cxxrtl_toplevel { std::unique_ptr module; } *cxxrtl_toplevel; +typedef struct _cxxrtl_outline { + std::function eval; +} *cxxrtl_outline; + // Definitions of internal Yosys cells. Other than the functions in this namespace, CXXRTL is fully generic // and indepenent of Yosys implementation details. // -- cgit v1.2.3 From f75bc6c7aac92c7c1c8954ec7fe5325a94e6e491 Mon Sep 17 00:00:00 2001 From: whitequark Date: Sun, 13 Dec 2020 18:16:55 +0000 Subject: cxxrtl: disable optimization of debug_items(). Implementing outlining has greatly increased the amount of debug information in a typical build, and consequently exposed performance issues in C++ compilers, which are similar for both GCC and Clang; the compile time of Minerva SoC SRAM increased almost twofold. Although one would expect the slowdown to be caused by the increased use of templates in `debug_eval()`, it is actually almost entirely attributable to optimizations and codegen for `debug_items()`. Fortunately, it is neither possible nor desirable to optimize `debug_items()`: in most cases it is called exactly once, and its body is a linear sequence of calls with unique arguments. This commit turns off optimizations for `debug_items()` on GCC and Clang, improving -Os compile time of Minerva SoC SRAM by ~40% (!) --- backends/cxxrtl/cxxrtl.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'backends/cxxrtl/cxxrtl.h') diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h index 59393e415..3c315c7df 100644 --- a/backends/cxxrtl/cxxrtl.h +++ b/backends/cxxrtl/cxxrtl.h @@ -41,18 +41,29 @@ #include +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + // CXXRTL essentially uses the C++ compiler as a hygienic macro engine that feeds an instruction selector. // It generates a lot of specialized template functions with relatively large bodies that, when inlined // into the caller and (for those with loops) unrolled, often expose many new optimization opportunities. // Because of this, most of the CXXRTL runtime must be always inlined for best performance. -#ifndef __has_attribute -# define __has_attribute(x) 0 -#endif #if __has_attribute(always_inline) #define CXXRTL_ALWAYS_INLINE inline __attribute__((__always_inline__)) #else #define CXXRTL_ALWAYS_INLINE inline #endif +// Conversely, some functions in the generated code are extremely large yet very cold, with both of these +// properties being extreme enough to confuse C++ compilers into spending pathological amounts of time +// on a futile (the code becomes worse) attempt to optimize the least important parts of code. +#if __has_attribute(optnone) +#define CXXRTL_EXTREMELY_COLD __attribute__((__optnone__)) +#elif __has_attribute(optimize) +#define CXXRTL_EXTREMELY_COLD __attribute__((__optimize__(0))) +#else +#define CXXRTL_EXTREMELY_COLD +#endif // CXXRTL uses assert() to check for C++ contract violations (which may result in e.g. undefined behavior // of the simulation code itself), and CXXRTL_ASSERT to check for RTL contract violations (which may at -- cgit v1.2.3