5 files changed, 730 insertions, 545 deletions
diff --git a/common/place/detail_place_cfg.h b/common/place/detail_place_cfg.h
new file mode 100644
index 00000000..dadda0e8
--- /dev/null
+++ b/common/place/detail_place_cfg.h
@@ -0,0 +1,36 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2021-22  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef DETAIL_PLACE_CFG_H
+#define DETAIL_PLACE_CFG_H
+
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct DetailPlaceCfg
+{
+    DetailPlaceCfg(Context *ctx);
+    bool timing_driven;
+    int hpwl_scale_x, hpwl_scale_y;
+    float crit_exp = 8;
+};
+
+NEXTPNR_NAMESPACE_END
+#endif
diff --git a/common/place/detail_place_core.cc b/common/place/detail_place_core.cc
new file mode 100644
index 00000000..bb383639
--- /dev/null
+++ b/common/place/detail_place_core.cc
@@ -0,0 +1,457 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2021-22  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "detail_place_core.h"
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+DetailPlaceCfg::DetailPlaceCfg(Context *ctx)
+{
+    timing_driven = ctx->setting<bool>("timing_driven");
+
+    hpwl_scale_x = 1;
+    hpwl_scale_y = 1;
+}
+
+PlacePartition::PlacePartition(Context *ctx)
+{
+    x0 = ctx->getGridDimX();
+    y0 = ctx->getGridDimY();
+    x1 = 0;
+    y1 = 0;
+    for (auto &cell : ctx->cells) {
+        Loc l = ctx->getBelLocation(cell.second->bel);
+        x0 = std::min(x0, l.x);
+        x1 = std::max(x1, l.x);
+        y0 = std::min(y0, l.y);
+        y1 = std::max(y1, l.y);
+        cells.push_back(cell.second.get());
+    }
+}
+
+void PlacePartition::split(Context *ctx, bool yaxis, float pivot, PlacePartition &l, PlacePartition &r)
+{
+    std::sort(cells.begin(), cells.end(), [&](CellInfo *a, CellInfo *b) {
+        Loc l0 = ctx->getBelLocation(a->bel), l1 = ctx->getBelLocation(b->bel);
+        return yaxis ? (l0.y < l1.y) : (l0.x < l1.x);
+    });
+    size_t pivot_point = size_t(cells.size() * pivot);
+    l.cells.clear();
+    r.cells.clear();
+    l.cells.reserve(pivot_point);
+    r.cells.reserve(cells.size() - pivot_point);
+    int pivot_coord = (pivot_point == 0) ? (yaxis ? y1 : x1)
+                                         : (yaxis ? ctx->getBelLocation(cells.at(pivot_point - 1)->bel).y
+                                                  : ctx->getBelLocation(cells.at(pivot_point - 1)->bel).x);
+    for (size_t i = 0; i < cells.size(); i++) {
+        Loc loc = ctx->getBelLocation(cells.at(i)->bel);
+        ((yaxis ? loc.y : loc.x) <= pivot_coord ? l.cells : r.cells).push_back(cells.at(i));
+    }
+    if (yaxis) {
+        l.x0 = r.x0 = x0;
+        l.x1 = r.x1 = x1;
+        l.y0 = y0;
+        l.y1 = pivot_coord;
+        r.y0 = (pivot_coord == y1) ? y1 : (pivot_coord + 1);
+        r.y1 = y1;
+    } else {
+        l.y0 = r.y0 = y0;
+        l.y1 = r.y1 = y1;
+        l.x0 = x0;
+        l.x1 = pivot_coord;
+        r.x0 = (pivot_coord == x1) ? x1 : (pivot_coord + 1);
+        r.x1 = x1;
+    }
+}
+
+void DetailPlacerState::update_global_costs()
+{
+    last_bounds.resize(flat_nets.size());
+    last_tmg_costs.resize(flat_nets.size());
+    total_wirelen = 0;
+    total_timing_cost = 0;
+    for (size_t i = 0; i < flat_nets.size(); i++) {
+        NetInfo *ni = flat_nets.at(i);
+        if (skip_net(ni))
+            continue;
+        last_bounds.at(i) = NetBB::compute(ctx, ni);
+        total_wirelen += last_bounds.at(i).hpwl(base_cfg);
+        if (!timing_skip_net(ni)) {
+            auto &tc = last_tmg_costs.at(i);
+            tc.resize(ni->users.capacity());
+            for (auto usr : ni->users.enumerate()) {
+                tc.at(usr.index.idx()) = get_timing_cost(ni, usr.index);
+                total_timing_cost += tc.at(usr.index.idx());
+            }
+        }
+    }
+}
+
+NetBB NetBB::compute(const Context *ctx, const NetInfo *net, const dict<IdString, BelId> *cell2bel)
+{
+    NetBB result{};
+    if (!net->driver.cell)
+        return result;
+    auto bel_loc = [&](const CellInfo *cell) {
+        BelId bel = cell2bel ? cell2bel->at(cell->name) : cell->bel;
+        return ctx->getBelLocation(bel);
+    };
+    result.nx0 = result.nx1 = result.ny0 = result.ny1 = 1;
+    Loc drv_loc = bel_loc(net->driver.cell);
+    result.x0 = result.x1 = drv_loc.x;
+    result.y0 = result.y1 = drv_loc.y;
+    for (auto &usr : net->users) {
+        Loc l = bel_loc(usr.cell);
+        if (l.x == result.x0)
+            ++result.nx0; // on the edge
+        else if (l.x < result.x0) {
+            result.x0 = l.x; // extends the edge
+            result.nx0 = 1;
+        }
+        if (l.x == result.x1)
+            ++result.nx1; // on the edge
+        else if (l.x > result.x1) {
+            result.x1 = l.x; // extends the edge
+            result.nx1 = 1;
+        }
+        if (l.y == result.y0)
+            ++result.ny0; // on the edge
+        else if (l.y < result.y0) {
+            result.y0 = l.y; // extends the edge
+            result.ny0 = 1;
+        }
+        if (l.y == result.y1)
+            ++result.ny1; // on the edge
+        else if (l.y > result.y1) {
+            result.y1 = l.y; // extends the edge
+            result.ny1 = 1;
+        }
+    }
+    return result;
+}
+
+void DetailPlacerThreadState::set_partition(const PlacePartition &part)
+{
+    p = part;
+    thread_nets.clear();
+    thread_net_idx.resize(g.flat_nets.size());
+    std::fill(thread_net_idx.begin(), thread_net_idx.end(), -1);
+    // Determine the set of nets that are within the thread; and therefore we care about
+    for (auto thread_cell : part.cells) {
+        for (auto &port : thread_cell->ports) {
+            if (!port.second.net)
+                continue;
+            int global_idx = port.second.net->udata;
+            auto &thread_idx = thread_net_idx.at(global_idx);
+            // Already added to the set
+            if (thread_idx != -1)
+                continue;
+            thread_idx = thread_nets.size();
+            thread_nets.push_back(port.second.net);
+        }
+    }
+    tmg_ignored_nets.clear();
+    ignored_nets.clear();
+    for (auto tn : thread_nets) {
+        ignored_nets.push_back(g.skip_net(tn));
+        tmg_ignored_nets.push_back(g.timing_skip_net(tn));
+    }
+    // Set up the original cell-bel map for all nets inside the thread
+    local_cell2bel.clear();
+    for (NetInfo *net : thread_nets) {
+        if (net->driver.cell)
+            local_cell2bel[net->driver.cell->name] = net->driver.cell->bel;
+        for (auto &usr : net->users)
+            local_cell2bel[usr.cell->name] = usr.cell->bel;
+    }
+}
+
+void DetailPlacerThreadState::setup_initial_state()
+{
+    // Setup initial net bounding boxes and timing costs
+    net_bounds.clear();
+    arc_tmg_cost.clear();
+    for (auto tn : thread_nets) {
+        net_bounds.push_back(g.last_bounds.at(tn->udata));
+        arc_tmg_cost.push_back(g.last_tmg_costs.at(tn->udata));
+    }
+    new_net_bounds = net_bounds;
+    for (int j = 0; j < 2; j++) {
+        auto &a = axes.at(j);
+        a.already_bounds_changed.resize(net_bounds.size());
+    }
+    already_timing_changed.clear();
+    already_timing_changed.resize(net_bounds.size());
+    for (size_t i = 0; i < thread_nets.size(); i++)
+        already_timing_changed.at(i) = std::vector<bool>(thread_nets.at(i)->users.capacity());
+}
+
+bool DetailPlacerThreadState::bounds_check(BelId bel)
+{
+    Loc l = ctx->getBelLocation(bel);
+    if (l.x < p.x0 || l.x > p.x1 || l.y < p.y0 || l.y > p.y1)
+        return false;
+    return true;
+}
+
+bool DetailPlacerThreadState::bind_move()
+{
+#if !defined(__wasm)
+    std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+#endif
+    for (auto &entry : moved_cells) {
+        ctx->unbindBel(entry.second.first);
+    }
+    bool success = true;
+    for (auto &entry : moved_cells) {
+        // Make sure targets are available before we bind them
+        if (!ctx->checkBelAvail(entry.second.second)) {
+            success = false;
+            break;
+        }
+        ctx->bindBel(entry.second.second, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+    }
+    arch_state_dirty = true;
+    return success;
+}
+
+bool DetailPlacerThreadState::check_validity()
+{
+#if !defined(__wasm)
+    std::shared_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+#endif
+    bool result = true;
+    for (auto e : moved_cells) {
+        if (!ctx->isBelLocationValid(e.second.first)) {
+            // Have to check old; too; as unbinding a bel could make a placement illegal by virtue of no longer
+            // enabling dedicated routes to be used
+            result = false;
+            break;
+        }
+        if (!ctx->isBelLocationValid(e.second.second)) {
+            result = false;
+            break;
+        }
+    }
+    return result;
+}
+
+void DetailPlacerThreadState::revert_move()
+{
+    if (arch_state_dirty) {
+        // If changes to the arch state were made, revert them by restoring original cell bindings
+#if !defined(__wasm)
+        std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+#endif
+        for (auto &entry : moved_cells) {
+            BelId curr_bound = ctx->cells.at(entry.first)->bel;
+            if (curr_bound != BelId())
+                ctx->unbindBel(curr_bound);
+        }
+        for (auto &entry : moved_cells) {
+            ctx->bindBel(entry.second.first, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+        }
+        arch_state_dirty = false;
+    }
+    for (auto &entry : moved_cells)
+        local_cell2bel[entry.first] = entry.second.first;
+}
+
+void DetailPlacerThreadState::commit_move()
+{
+    arch_state_dirty = false;
+    for (auto &axis : axes) {
+        for (auto bc : axis.bounds_changed_nets) {
+            // Commit updated net bounds
+            net_bounds.at(bc) = new_net_bounds.at(bc);
+        }
+    }
+    if (g.base_cfg.timing_driven) {
+        NPNR_ASSERT(timing_changed_arcs.size() == new_timing_costs.size());
+        for (size_t i = 0; i < timing_changed_arcs.size(); i++) {
+            auto arc = timing_changed_arcs.at(i);
+            arc_tmg_cost.at(arc.first).at(arc.second.idx()) = new_timing_costs.at(i);
+        }
+    }
+}
+
+void DetailPlacerThreadState::compute_changes_for_cell(CellInfo *cell, BelId old_bel, BelId new_bel)
+{
+    Loc new_loc = ctx->getBelLocation(new_bel);
+    Loc old_loc = ctx->getBelLocation(old_bel);
+    for (const auto &port : cell->ports) {
+        NetInfo *pn = port.second.net;
+        if (!pn)
+            continue;
+        int idx = thread_net_idx.at(pn->udata);
+        if (ignored_nets.at(idx))
+            continue;
+        NetBB &new_bounds = new_net_bounds.at(idx);
+        // For the x-axis (i=0) and y-axis (i=1)
+        for (int i = 0; i < 2; i++) {
+            auto &axis = axes.at(i);
+            // New and old on this axis
+            int new_pos = i ? new_loc.y : new_loc.x, old_pos = i ? old_loc.y : old_loc.x;
+            // References to updated bounding box entries
+            auto &b0 = i ? new_bounds.y0 : new_bounds.x0;
+            auto &n0 = i ? new_bounds.ny0 : new_bounds.nx0;
+            auto &b1 = i ? new_bounds.y1 : new_bounds.x1;
+            auto &n1 = i ? new_bounds.ny1 : new_bounds.nx1;
+            auto &change = axis.already_bounds_changed.at(idx);
+            // Lower bound
+            if (new_pos < b0) {
+                // Further out than current lower bound
+                b0 = new_pos;
+                n0 = 1;
+                if (change == NO_CHANGE) {
+                    change = CELL_MOVED_OUTWARDS;
+                    axis.bounds_changed_nets.push_back(idx);
+                }
+            } else if (new_pos == b0 && old_pos > b0) {
+                // Moved from inside into current bound
+                ++n0;
+                if (change == NO_CHANGE) {
+                    change = CELL_MOVED_OUTWARDS;
+                    axis.bounds_changed_nets.push_back(idx);
+                }
+            } else if (old_pos == b0 && new_pos > b0) {
+                // Moved from current bound to inside
+                if (change == NO_CHANGE)
+                    axis.bounds_changed_nets.push_back(idx);
+                if (n0 == 1) {
+                    // Was the last cell on the bound; have to do a full recompute
+                    change = FULL_RECOMPUTE;
+                } else {
+                    --n0;
+                    if (change == NO_CHANGE)
+                        change = CELL_MOVED_INWARDS;
+                }
+            }
+            // Upper bound
+            if (new_pos > b1) {
+                // Further out than current upper bound
+                b1 = new_pos;
+                n1 = new_pos;
+                if (change == NO_CHANGE) {
+                    change = CELL_MOVED_OUTWARDS;
+                    axis.bounds_changed_nets.push_back(idx);
+                }
+            } else if (new_pos == b1 && old_pos < b1) {
+                // Moved onto current bound
+                ++n1;
+                if (change == NO_CHANGE) {
+                    change = CELL_MOVED_OUTWARDS;
+                    axis.bounds_changed_nets.push_back(idx);
+                }
+            } else if (old_pos == b1 && new_pos < b1) {
+                // Moved from current bound to inside
+                if (change == NO_CHANGE)
+                    axis.bounds_changed_nets.push_back(idx);
+                if (n1 == 1) {
+                    // Was the last cell on the bound; have to do a full recompute
+                    change = FULL_RECOMPUTE;
+                } else {
+                    --n1;
+                    if (change == NO_CHANGE)
+                        change = CELL_MOVED_INWARDS;
+                }
+            }
+        }
+        // Timing updates if timing driven
+        if (g.base_cfg.timing_driven && !tmg_ignored_nets.at(idx)) {
+            if (port.second.type == PORT_OUT) {
+                int cc;
+                TimingPortClass cls = ctx->getPortTimingClass(cell, port.first, cc);
+                if (cls != TMG_IGNORE) {
+                    for (auto usr : pn->users.enumerate())
+                        if (!already_timing_changed.at(idx).at(usr.index.idx())) {
+                            timing_changed_arcs.emplace_back(std::make_pair(idx, usr.index));
+                            already_timing_changed.at(idx).at(usr.index.idx()) = true;
+                        }
+                }
+            } else {
+                auto usr = port.second.user_idx;
+                if (!already_timing_changed.at(idx).at(usr.idx())) {
+                    timing_changed_arcs.emplace_back(std::make_pair(idx, usr));
+                    already_timing_changed.at(idx).at(usr.idx()) = true;
+                }
+            }
+        }
+    }
+}
+
+void DetailPlacerThreadState::compute_total_change()
+{
+    auto &xa = axes.at(0), &ya = axes.at(1);
+    for (auto &bc : xa.bounds_changed_nets)
+        if (xa.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+            new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+    for (auto &bc : ya.bounds_changed_nets)
+        if (xa.already_bounds_changed.at(bc) != FULL_RECOMPUTE && ya.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+            new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+    for (auto &bc : xa.bounds_changed_nets)
+        wirelen_delta += (new_net_bounds.at(bc).hpwl(g.base_cfg) - net_bounds.at(bc).hpwl(g.base_cfg));
+    for (auto &bc : ya.bounds_changed_nets)
+        if (xa.already_bounds_changed.at(bc) == NO_CHANGE)
+            wirelen_delta += (new_net_bounds.at(bc).hpwl(g.base_cfg) - net_bounds.at(bc).hpwl(g.base_cfg));
+    if (g.base_cfg.timing_driven) {
+        NPNR_ASSERT(new_timing_costs.empty());
+        for (auto arc : timing_changed_arcs) {
+            double new_cost = g.get_timing_cost(thread_nets.at(arc.first), arc.second, &local_cell2bel);
+            timing_delta += (new_cost - arc_tmg_cost.at(arc.first).at(arc.second.idx()));
+            new_timing_costs.push_back(new_cost);
+        }
+    }
+}
+
+void DetailPlacerThreadState::reset_move_state()
+{
+    moved_cells.clear();
+    cell_rel.clear();
+    for (auto &axis : axes) {
+        for (auto bc : axis.bounds_changed_nets) {
+            new_net_bounds.at(bc) = net_bounds.at(bc);
+            axis.already_bounds_changed[bc] = NO_CHANGE;
+        }
+        axis.bounds_changed_nets.clear();
+    }
+    for (auto &arc : timing_changed_arcs) {
+        already_timing_changed.at(arc.first).at(arc.second.idx()) = false;
+    }
+    timing_changed_arcs.clear();
+    new_timing_costs.clear();
+    wirelen_delta = 0;
+    timing_delta = 0;
+}
+
+bool DetailPlacerThreadState::add_to_move(CellInfo *cell, BelId old_bel, BelId new_bel)
+{
+    if (!bounds_check(old_bel) || !bounds_check(new_bel))
+        return false;
+    if (!ctx->isValidBelForCellType(cell->type, new_bel))
+        return false;
+    NPNR_ASSERT(!moved_cells.count(cell->name));
+    moved_cells[cell->name] = std::make_pair(old_bel, new_bel);
+    local_cell2bel[cell->name] = new_bel;
+    compute_changes_for_cell(cell, old_bel, new_bel);
+    return true;
+}
+
+NEXTPNR_NAMESPACE_END
diff --git a/common/place/detail_place_core.h b/common/place/detail_place_core.h
new file mode 100644
index 00000000..1c280f24
--- /dev/null
+++ b/common/place/detail_place_core.h
@@ -0,0 +1,224 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2021-22  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+/*
+This provides core data structures for a thread-safe detail placer that needs to swap cells and evaluate the cost
+changes of swaps.
+
+It works on a partition-based threading approach; although threading can be avoided by only instantiating one
+per-thread structure and calling its methods from the main thread.
+
+Each thread's data includes its own local net indexing for nets inside the partition (which can overlap thread
+boundaries); and its own local cell-to-bel mapping for any cells on those nets, so there are no races with moves
+made by other threads.
+
+A move is an atomic transaction of updated cell to bel mappings inside a thread. The first step is to reset the
+per-move structures; then to add all of the moved cells to the move with add_to_move.
+
+Evaluation of wirelength and timing changes of a move is done with compute_changes_for_cell and compute_total_change.
+
+bind_move will probationally bind the move using the arch API functions, acquiring a lock during this time to prevent
+races on non-thread-safe arch implementations, returning true if the bind succeeded or false if something went wrong
+and it should be aborted. check_validity must then be called to use the arch API validity check functions on the move.
+
+Finally if the move meets criteria and is accepted then commit_move marks it as committed, otherwise revert_move
+aborts the entire move transaction.
+*/
+
+#ifndef DETAIL_PLACE_CORE_H
+#define DETAIL_PLACE_CORE_H
+
+#include "nextpnr.h"
+
+#include "detail_place_cfg.h"
+#include "fast_bels.h"
+#include "timing.h"
+
+#include <queue>
+
+#if !defined(__wasm)
+#include <shared_mutex>
+#endif
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct PlacePartition
+{
+    int x0, y0, x1, y1;
+    std::vector<CellInfo *> cells;
+    PlacePartition() = default;
+    explicit PlacePartition(Context *ctx);
+    void split(Context *ctx, bool yaxis, float pivot, PlacePartition &l, PlacePartition &r);
+};
+
+typedef int64_t wirelen_t;
+
+struct NetBB
+{
+    // Actual bounding box
+    int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
+    // Number of cells at each extremity
+    int nx0 = 0, nx1 = 0, ny0 = 0, ny1 = 0;
+    inline wirelen_t hpwl(const DetailPlaceCfg &cfg) const
+    {
+        return wirelen_t(cfg.hpwl_scale_x * (x1 - x0) + cfg.hpwl_scale_y * (y1 - y0));
+    }
+    static NetBB compute(const Context *ctx, const NetInfo *net, const dict<IdString, BelId> *cell2bel = nullptr);
+};
+
+struct DetailPlacerState
+{
+    explicit DetailPlacerState(Context *ctx, DetailPlaceCfg &cfg)
+            : ctx(ctx), base_cfg(cfg), bels(ctx, false, 64), tmg(ctx){};
+    Context *ctx;
+    DetailPlaceCfg &base_cfg;
+    FastBels bels;
+    std::vector<NetInfo *> flat_nets; // flat array of all nets in the design for fast referencing by index
+    std::vector<NetBB> last_bounds;
+    std::vector<std::vector<double>> last_tmg_costs;
+    dict<IdString, NetBB> region_bounds;
+    TimingAnalyser tmg;
+
+    wirelen_t total_wirelen = 0;
+    double total_timing_cost = 0;
+
+#if !defined(__wasm)
+    std::shared_timed_mutex archapi_mutex;
+#endif
+
+    inline double get_timing_cost(const NetInfo *net, store_index<PortRef> user,
+                                  const dict<IdString, BelId> *cell2bel = nullptr)
+    {
+        if (!net->driver.cell)
+            return 0;
+        const auto &sink = net->users.at(user);
+        IdString driver_pin, sink_pin;
+        // Pick the first pin for a prediction; assume all will be similar enouhg
+        for (auto pin : ctx->getBelPinsForCellPin(net->driver.cell, net->driver.port)) {
+            driver_pin = pin;
+            break;
+        }
+        for (auto pin : ctx->getBelPinsForCellPin(sink.cell, sink.port)) {
+            sink_pin = pin;
+            break;
+        }
+        float crit = tmg.get_criticality(CellPortKey(sink));
+        BelId src_bel = cell2bel ? cell2bel->at(net->driver.cell->name) : net->driver.cell->bel;
+        BelId dst_bel = cell2bel ? cell2bel->at(sink.cell->name) : sink.cell->bel;
+        double delay = ctx->getDelayNS(ctx->predictDelay(src_bel, driver_pin, dst_bel, sink_pin));
+        return delay * std::pow(crit, base_cfg.crit_exp);
+    }
+
+    inline bool skip_net(const NetInfo *net) const
+    {
+        if (!net->driver.cell)
+            return true;
+        if (ctx->getBelGlobalBuf(net->driver.cell->bel))
+            return true;
+        return false;
+    }
+
+    inline bool timing_skip_net(const NetInfo *net) const
+    {
+        if (!net->driver.cell)
+            return true;
+        int cc;
+        auto cls = ctx->getPortTimingClass(net->driver.cell, net->driver.port, cc);
+        if (cls == TMG_IGNORE || cls == TMG_GEN_CLOCK)
+            return true;
+        return false;
+    }
+
+    void update_global_costs();
+};
+
+struct DetailPlacerThreadState
+{
+    Context *ctx;         // Nextpnr context pointer
+    DetailPlacerState &g; // Placer engine state
+    int idx;              // Index of the thread
+    DeterministicRNG rng; // Local RNG
+    // The cell partition that the thread works on
+    PlacePartition p;
+    // Mapping from design-wide net index to thread-wide net index -- not all nets are in all partitions, so we can
+    // optimise
+    std::vector<int> thread_net_idx;
+    // List of nets inside the partition; and their committed bounding boxes & timing costs from the thread's
+    // perspective
+    std::vector<NetInfo *> thread_nets;
+    std::vector<NetBB> net_bounds;
+    std::vector<std::vector<double>> arc_tmg_cost;
+    std::vector<bool> ignored_nets, tmg_ignored_nets;
+    bool arch_state_dirty = false;
+    // Our local cell-bel map; that won't be affected by out-of-partition moves
+    dict<IdString, BelId> local_cell2bel;
+
+    // Data on an inflight move
+    dict<IdString, std::pair<BelId, BelId>> moved_cells; // cell -> (old; new)
+    // For cluster moves only
+    std::vector<std::pair<CellInfo *, Loc>> cell_rel;
+    // For incremental wirelength and delay updates
+    wirelen_t wirelen_delta = 0;
+    double timing_delta = 0;
+    // Wirelen related are handled on a per-axis basis to reduce
+    enum BoundChange
+    {
+        NO_CHANGE,
+        CELL_MOVED_INWARDS,
+        CELL_MOVED_OUTWARDS,
+        FULL_RECOMPUTE
+    };
+    struct AxisChanges
+    {
+        std::vector<int> bounds_changed_nets;
+        std::vector<BoundChange> already_bounds_changed;
+    };
+    std::array<AxisChanges, 2> axes;
+    std::vector<NetBB> new_net_bounds;
+
+    std::vector<std::vector<bool>> already_timing_changed;
+    std::vector<std::pair<int, store_index<PortRef>>> timing_changed_arcs;
+    std::vector<double> new_timing_costs;
+
+    DetailPlacerThreadState(Context *ctx, DetailPlacerState &g, int idx) : ctx(ctx), g(g), idx(idx){};
+    void set_partition(const PlacePartition &part);
+    void setup_initial_state();
+    bool bounds_check(BelId bel);
+
+    // Reset the inflight move state
+    void reset_move_state();
+    // Add a cell change to the move
+    bool add_to_move(CellInfo *cell, BelId old_bel, BelId new_bel);
+    // For an inflight move; attempt to actually apply the changes to the arch API
+    bool bind_move();
+    // Checks if the arch API bel validity for a move is accepted
+    bool check_validity();
+    // Undo any changes relating to an inflight move
+    void revert_move();
+    // Mark the inflight move as complete and update cost structures
+    void commit_move();
+    // Update the inflight cost change structures for a given cell moe
+    void compute_changes_for_cell(CellInfo *cell, BelId old_bel, BelId new_bel);
+    // Update the total cost change for an inflight move
+    void compute_total_change();
+};
+
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/common/place/parallel_refine.cc b/common/place/parallel_refine.cc
index a868ca58..a2de5b13 100644
--- a/common/place/parallel_refine.cc
+++ b/common/place/parallel_refine.cc
@@ -22,9 +22,8 @@
 
 #if !defined(__wasm)
 
-#include "fast_bels.h"
+#include "detail_place_core.h"
 #include "scope_lock.h"
-#include "timing.h"
 
 #include <chrono>
 #include <mutex>
@@ -35,515 +34,24 @@
 NEXTPNR_NAMESPACE_BEGIN
 
 namespace {
-struct Partition
-{
-    int x0, y0, x1, y1;
-    std::vector<CellInfo *> cells;
-    Partition() = default;
-    explicit Partition(Context *ctx)
-    {
-        x0 = ctx->getGridDimX();
-        y0 = ctx->getGridDimY();
-        x1 = 0;
-        y1 = 0;
-        for (auto &cell : ctx->cells) {
-            Loc l = ctx->getBelLocation(cell.second->bel);
-            x0 = std::min(x0, l.x);
-            x1 = std::max(x1, l.x);
-            y0 = std::min(y0, l.y);
-            y1 = std::max(y1, l.y);
-            cells.push_back(cell.second.get());
-        }
-    }
-    void split(Context *ctx, bool yaxis, float pivot, Partition &l, Partition &r)
-    {
-        std::sort(cells.begin(), cells.end(), [&](CellInfo *a, CellInfo *b) {
-            Loc l0 = ctx->getBelLocation(a->bel), l1 = ctx->getBelLocation(b->bel);
-            return yaxis ? (l0.y < l1.y) : (l0.x < l1.x);
-        });
-        size_t pivot_point = size_t(cells.size() * pivot);
-        l.cells.clear();
-        r.cells.clear();
-        l.cells.reserve(pivot_point);
-        r.cells.reserve(cells.size() - pivot_point);
-        int pivot_coord = (pivot_point == 0) ? (yaxis ? y1 : x1)
-                                             : (yaxis ? ctx->getBelLocation(cells.at(pivot_point - 1)->bel).y
-                                                      : ctx->getBelLocation(cells.at(pivot_point - 1)->bel).x);
-        for (size_t i = 0; i < cells.size(); i++) {
-            Loc loc = ctx->getBelLocation(cells.at(i)->bel);
-            ((yaxis ? loc.y : loc.x) <= pivot_coord ? l.cells : r.cells).push_back(cells.at(i));
-        }
-        if (yaxis) {
-            l.x0 = r.x0 = x0;
-            l.x1 = r.x1 = x1;
-            l.y0 = y0;
-            l.y1 = pivot_coord;
-            r.y0 = (pivot_coord == y1) ? y1 : (pivot_coord + 1);
-            r.y1 = y1;
-        } else {
-            l.y0 = r.y0 = y0;
-            l.y1 = r.y1 = y1;
-            l.x0 = x0;
-            l.x1 = pivot_coord;
-            r.x0 = (pivot_coord == x1) ? x1 : (pivot_coord + 1);
-            r.x1 = x1;
-        }
-    }
-};
 
-typedef int64_t wirelen_t;
-
-struct NetBB
+struct GlobalState : DetailPlacerState
 {
-    // Actual bounding box
-    int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
-    // Number of cells at each extremity
-    int nx0 = 0, nx1 = 0, ny0 = 0, ny1 = 0;
-    wirelen_t hpwl(const ParallelRefineCfg &cfg) const
-    {
-        return wirelen_t(cfg.hpwl_scale_x * (x1 - x0) + cfg.hpwl_scale_y * (y1 - y0));
-    }
-    static NetBB compute(const Context *ctx, const NetInfo *net, const dict<IdString, BelId> *cell2bel = nullptr)
-    {
-        NetBB result{};
-        if (!net->driver.cell)
-            return result;
-        auto bel_loc = [&](const CellInfo *cell) {
-            BelId bel = cell2bel ? cell2bel->at(cell->name) : cell->bel;
-            return ctx->getBelLocation(bel);
-        };
-        result.nx0 = result.nx1 = result.ny0 = result.ny1 = 1;
-        Loc drv_loc = bel_loc(net->driver.cell);
-        result.x0 = result.x1 = drv_loc.x;
-        result.y0 = result.y1 = drv_loc.y;
-        for (auto &usr : net->users) {
-            Loc l = bel_loc(usr.cell);
-            if (l.x == result.x0)
-                ++result.nx0; // on the edge
-            else if (l.x < result.x0) {
-                result.x0 = l.x; // extends the edge
-                result.nx0 = 1;
-            }
-            if (l.x == result.x1)
-                ++result.nx1; // on the edge
-            else if (l.x > result.x1) {
-                result.x1 = l.x; // extends the edge
-                result.nx1 = 1;
-            }
-            if (l.y == result.y0)
-                ++result.ny0; // on the edge
-            else if (l.y < result.y0) {
-                result.y0 = l.y; // extends the edge
-                result.ny0 = 1;
-            }
-            if (l.y == result.y1)
-                ++result.ny1; // on the edge
-            else if (l.y > result.y1) {
-                result.y1 = l.y; // extends the edge
-                result.ny1 = 1;
-            }
-        }
-        return result;
-    }
-};
+    explicit GlobalState(Context *ctx, ParallelRefineCfg cfg) : DetailPlacerState(ctx, this->cfg), cfg(cfg){};
 
-struct GlobalState
-{
-    explicit GlobalState(Context *ctx, ParallelRefineCfg cfg) : ctx(ctx), cfg(cfg), bels(ctx, false, 64), tmg(ctx){};
-    Context *ctx;
     ParallelRefineCfg cfg;
-    FastBels bels;
-    std::vector<NetInfo *> flat_nets; // flat array of all nets in the design for fast referencing by index
-    std::vector<NetBB> last_bounds;
-    std::vector<std::vector<double>> last_tmg_costs;
-    dict<IdString, NetBB> region_bounds;
-    TimingAnalyser tmg;
-
-    std::shared_timed_mutex archapi_mutex;
-
     double temperature = 1e-7;
     int radius = 3;
-
-    wirelen_t total_wirelen = 0;
-    double total_timing_cost = 0;
-
-    double get_timing_cost(const NetInfo *net, store_index<PortRef> user,
-                           const dict<IdString, BelId> *cell2bel = nullptr)
-    {
-        if (!net->driver.cell)
-            return 0;
-        const auto &sink = net->users.at(user);
-        IdString driver_pin, sink_pin;
-        // Pick the first pin for a prediction; assume all will be similar enouhg
-        for (auto pin : ctx->getBelPinsForCellPin(net->driver.cell, net->driver.port)) {
-            driver_pin = pin;
-            break;
-        }
-        for (auto pin : ctx->getBelPinsForCellPin(sink.cell, sink.port)) {
-            sink_pin = pin;
-            break;
-        }
-        float crit = tmg.get_criticality(CellPortKey(sink));
-        BelId src_bel = cell2bel ? cell2bel->at(net->driver.cell->name) : net->driver.cell->bel;
-        BelId dst_bel = cell2bel ? cell2bel->at(sink.cell->name) : sink.cell->bel;
-        double delay = ctx->getDelayNS(ctx->predictDelay(src_bel, driver_pin, dst_bel, sink_pin));
-        return delay * std::pow(crit, cfg.crit_exp);
-    }
-
-    bool skip_net(const NetInfo *net) const
-    {
-        if (!net->driver.cell)
-            return true;
-        if (ctx->getBelGlobalBuf(net->driver.cell->bel))
-            return true;
-        return false;
-    }
-    bool timing_skip_net(const NetInfo *net) const
-    {
-        if (!net->driver.cell)
-            return true;
-        int cc;
-        auto cls = ctx->getPortTimingClass(net->driver.cell, net->driver.port, cc);
-        if (cls == TMG_IGNORE || cls == TMG_GEN_CLOCK)
-            return true;
-        return false;
-    }
     // ....
 };
 
-struct ThreadState
+struct ThreadState : DetailPlacerThreadState
 {
-    Context *ctx;         // Nextpnr context pointer
-    GlobalState &g;       // Refinement engine state
-    int idx;              // Index of the thread
-    DeterministicRNG rng; // Local RNG
-    // The cell partition that the thread works on
-    Partition p;
-    // Mapping from design-wide net index to thread-wide net index -- not all nets are in all partitions, so we can
-    // optimise
-    std::vector<int> thread_net_idx;
-    // List of nets inside the partition; and their committed bounding boxes & timing costs from the thread's
-    // perspective
-    std::vector<NetInfo *> thread_nets;
-    std::vector<NetBB> net_bounds;
-    std::vector<std::vector<double>> arc_tmg_cost;
-    std::vector<bool> ignored_nets, tmg_ignored_nets;
-    bool arch_state_dirty = false;
-    // Our local cell-bel map; that won't be affected by out-of-partition moves
-    dict<IdString, BelId> local_cell2bel;
-
-    // Data on an inflight move
-    dict<IdString, std::pair<BelId, BelId>> moved_cells; // cell -> (old; new)
-    // For cluster moves only
-    std::vector<std::pair<CellInfo *, Loc>> cell_rel;
-    // For incremental wirelength and delay updates
-    wirelen_t wirelen_delta = 0;
-    double timing_delta = 0;
-
+    ThreadState(Context *ctx, GlobalState &g, int idx) : DetailPlacerThreadState(ctx, g, idx), g(g){};
     // Total made and accepted moved
+    GlobalState &g;
     int n_move = 0, n_accept = 0;
 
-    enum BoundChange
-    {
-        NO_CHANGE,
-        CELL_MOVED_INWARDS,
-        CELL_MOVED_OUTWARDS,
-        FULL_RECOMPUTE
-    };
-    // Wirelen related are handled on a per-axis basis to reduce
-    struct AxisChanges
-    {
-        std::vector<int> bounds_changed_nets;
-        std::vector<BoundChange> already_bounds_changed;
-    };
-    std::array<AxisChanges, 2> axes;
-    std::vector<NetBB> new_net_bounds;
-
-    std::vector<std::vector<bool>> already_timing_changed;
-    std::vector<std::pair<int, store_index<PortRef>>> timing_changed_arcs;
-    std::vector<double> new_timing_costs;
-
-    ThreadState(Context *ctx, GlobalState &g, int idx) : ctx(ctx), g(g), idx(idx){};
-    void set_partition(const Partition &part)
-    {
-        p = part;
-        thread_nets.clear();
-        thread_net_idx.resize(g.flat_nets.size());
-        std::fill(thread_net_idx.begin(), thread_net_idx.end(), -1);
-        // Determine the set of nets that are within the thread; and therefore we care about
-        for (auto thread_cell : part.cells) {
-            for (auto &port : thread_cell->ports) {
-                if (!port.second.net)
-                    continue;
-                int global_idx = port.second.net->udata;
-                auto &thread_idx = thread_net_idx.at(global_idx);
-                // Already added to the set
-                if (thread_idx != -1)
-                    continue;
-                thread_idx = thread_nets.size();
-                thread_nets.push_back(port.second.net);
-            }
-        }
-        tmg_ignored_nets.clear();
-        ignored_nets.clear();
-        for (auto tn : thread_nets) {
-            ignored_nets.push_back(g.skip_net(tn));
-            tmg_ignored_nets.push_back(g.timing_skip_net(tn));
-        }
-        // Set up the original cell-bel map for all nets inside the thread
-        local_cell2bel.clear();
-        for (NetInfo *net : thread_nets) {
-            if (net->driver.cell)
-                local_cell2bel[net->driver.cell->name] = net->driver.cell->bel;
-            for (auto &usr : net->users)
-                local_cell2bel[usr.cell->name] = usr.cell->bel;
-        }
-    }
-    void setup_initial_state()
-    {
-        // Setup initial net bounding boxes and timing costs
-        net_bounds.clear();
-        arc_tmg_cost.clear();
-        for (auto tn : thread_nets) {
-            net_bounds.push_back(g.last_bounds.at(tn->udata));
-            arc_tmg_cost.push_back(g.last_tmg_costs.at(tn->udata));
-        }
-        new_net_bounds = net_bounds;
-        for (int j = 0; j < 2; j++) {
-            auto &a = axes.at(j);
-            a.already_bounds_changed.resize(net_bounds.size());
-        }
-        already_timing_changed.clear();
-        already_timing_changed.resize(net_bounds.size());
-        for (size_t i = 0; i < thread_nets.size(); i++)
-            already_timing_changed.at(i) = std::vector<bool>(thread_nets.at(i)->users.capacity());
-    }
-    bool bounds_check(BelId bel)
-    {
-        Loc l = ctx->getBelLocation(bel);
-        if (l.x < p.x0 || l.x > p.x1 || l.y < p.y0 || l.y > p.y1)
-            return false;
-        return true;
-    }
-    bool bind_move()
-    {
-        std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
-        for (auto &entry : moved_cells) {
-            ctx->unbindBel(entry.second.first);
-        }
-        bool success = true;
-        for (auto &entry : moved_cells) {
-            // Make sure targets are available before we bind them
-            if (!ctx->checkBelAvail(entry.second.second)) {
-                success = false;
-                break;
-            }
-            ctx->bindBel(entry.second.second, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
-        }
-        arch_state_dirty = true;
-        return success;
-    }
-    bool check_validity()
-    {
-        std::shared_lock<std::shared_timed_mutex> l(g.archapi_mutex);
-        bool result = true;
-        for (auto e : moved_cells) {
-            if (!ctx->isBelLocationValid(e.second.first)) {
-                // Have to check old; too; as unbinding a bel could make a placement illegal by virtue of no longer
-                // enabling dedicated routes to be used
-                result = false;
-                break;
-            }
-            if (!ctx->isBelLocationValid(e.second.second)) {
-                result = false;
-                break;
-            }
-        }
-        return result;
-    }
-    void revert_move()
-    {
-        if (arch_state_dirty) {
-            // If changes to the arch state were made, revert them by restoring original cell bindings
-            std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
-            for (auto &entry : moved_cells) {
-                BelId curr_bound = ctx->cells.at(entry.first)->bel;
-                if (curr_bound != BelId())
-                    ctx->unbindBel(curr_bound);
-            }
-            for (auto &entry : moved_cells) {
-                ctx->bindBel(entry.second.first, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
-            }
-            arch_state_dirty = false;
-        }
-        for (auto &entry : moved_cells)
-            local_cell2bel[entry.first] = entry.second.first;
-    }
-    void commit_move()
-    {
-        arch_state_dirty = false;
-        for (auto &axis : axes) {
-            for (auto bc : axis.bounds_changed_nets) {
-                // Commit updated net bounds
-                net_bounds.at(bc) = new_net_bounds.at(bc);
-            }
-        }
-        if (g.cfg.timing_driven) {
-            NPNR_ASSERT(timing_changed_arcs.size() == new_timing_costs.size());
-            for (size_t i = 0; i < timing_changed_arcs.size(); i++) {
-                auto arc = timing_changed_arcs.at(i);
-                arc_tmg_cost.at(arc.first).at(arc.second.idx()) = new_timing_costs.at(i);
-            }
-        }
-    }
-    void compute_changes_for_cell(CellInfo *cell, BelId old_bel, BelId new_bel)
-    {
-        Loc new_loc = ctx->getBelLocation(new_bel);
-        Loc old_loc = ctx->getBelLocation(old_bel);
-        for (const auto &port : cell->ports) {
-            NetInfo *pn = port.second.net;
-            if (!pn)
-                continue;
-            int idx = thread_net_idx.at(pn->udata);
-            if (ignored_nets.at(idx))
-                continue;
-            NetBB &new_bounds = new_net_bounds.at(idx);
-            // For the x-axis (i=0) and y-axis (i=1)
-            for (int i = 0; i < 2; i++) {
-                auto &axis = axes.at(i);
-                // New and old on this axis
-                int new_pos = i ? new_loc.y : new_loc.x, old_pos = i ? old_loc.y : old_loc.x;
-                // References to updated bounding box entries
-                auto &b0 = i ? new_bounds.y0 : new_bounds.x0;
-                auto &n0 = i ? new_bounds.ny0 : new_bounds.nx0;
-                auto &b1 = i ? new_bounds.y1 : new_bounds.x1;
-                auto &n1 = i ? new_bounds.ny1 : new_bounds.nx1;
-                auto &change = axis.already_bounds_changed.at(idx);
-                // Lower bound
-                if (new_pos < b0) {
-                    // Further out than current lower bound
-                    b0 = new_pos;
-                    n0 = 1;
-                    if (change == NO_CHANGE) {
-                        change = CELL_MOVED_OUTWARDS;
-                        axis.bounds_changed_nets.push_back(idx);
-                    }
-                } else if (new_pos == b0 && old_pos > b0) {
-                    // Moved from inside into current bound
-                    ++n0;
-                    if (change == NO_CHANGE) {
-                        change = CELL_MOVED_OUTWARDS;
-                        axis.bounds_changed_nets.push_back(idx);
-                    }
-                } else if (old_pos == b0 && new_pos > b0) {
-                    // Moved from current bound to inside
-                    if (change == NO_CHANGE)
-                        axis.bounds_changed_nets.push_back(idx);
-                    if (n0 == 1) {
-                        // Was the last cell on the bound; have to do a full recompute
-                        change = FULL_RECOMPUTE;
-                    } else {
-                        --n0;
-                        if (change == NO_CHANGE)
-                            change = CELL_MOVED_INWARDS;
-                    }
-                }
-                // Upper bound
-                if (new_pos > b1) {
-                    // Further out than current upper bound
-                    b1 = new_pos;
-                    n1 = new_pos;
-                    if (change == NO_CHANGE) {
-                        change = CELL_MOVED_OUTWARDS;
-                        axis.bounds_changed_nets.push_back(idx);
-                    }
-                } else if (new_pos == b1 && old_pos < b1) {
-                    // Moved onto current bound
-                    ++n1;
-                    if (change == NO_CHANGE) {
-                        change = CELL_MOVED_OUTWARDS;
-                        axis.bounds_changed_nets.push_back(idx);
-                    }
-                } else if (old_pos == b1 && new_pos < b1) {
-                    // Moved from current bound to inside
-                    if (change == NO_CHANGE)
-                        axis.bounds_changed_nets.push_back(idx);
-                    if (n1 == 1) {
-                        // Was the last cell on the bound; have to do a full recompute
-                        change = FULL_RECOMPUTE;
-                    } else {
-                        --n1;
-                        if (change == NO_CHANGE)
-                            change = CELL_MOVED_INWARDS;
-                    }
-                }
-            }
-            // Timing updates if timing driven
-            if (g.cfg.timing_driven && !tmg_ignored_nets.at(idx)) {
-                if (port.second.type == PORT_OUT) {
-                    int cc;
-                    TimingPortClass cls = ctx->getPortTimingClass(cell, port.first, cc);
-                    if (cls != TMG_IGNORE) {
-                        for (auto usr : pn->users.enumerate())
-                            if (!already_timing_changed.at(idx).at(usr.index.idx())) {
-                                timing_changed_arcs.emplace_back(std::make_pair(idx, usr.index));
-                                already_timing_changed.at(idx).at(usr.index.idx()) = true;
-                            }
-                    }
-                } else {
-                    auto usr = port.second.user_idx;
-                    if (!already_timing_changed.at(idx).at(usr.idx())) {
-                        timing_changed_arcs.emplace_back(std::make_pair(idx, usr));
-                        already_timing_changed.at(idx).at(usr.idx()) = true;
-                    }
-                }
-            }
-        }
-    }
-    void compute_total_change()
-    {
-        auto &xa = axes.at(0), &ya = axes.at(1);
-        for (auto &bc : xa.bounds_changed_nets)
-            if (xa.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
-                new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
-        for (auto &bc : ya.bounds_changed_nets)
-            if (xa.already_bounds_changed.at(bc) != FULL_RECOMPUTE &&
-                ya.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
-                new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
-        for (auto &bc : xa.bounds_changed_nets)
-            wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
-        for (auto &bc : ya.bounds_changed_nets)
-            if (xa.already_bounds_changed.at(bc) == NO_CHANGE)
-                wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
-        if (g.cfg.timing_driven) {
-            NPNR_ASSERT(new_timing_costs.empty());
-            for (auto arc : timing_changed_arcs) {
-                double new_cost = g.get_timing_cost(thread_nets.at(arc.first), arc.second, &local_cell2bel);
-                timing_delta += (new_cost - arc_tmg_cost.at(arc.first).at(arc.second.idx()));
-                new_timing_costs.push_back(new_cost);
-            }
-        }
-    }
-    void reset_move_state()
-    {
-        moved_cells.clear();
-        cell_rel.clear();
-        for (auto &axis : axes) {
-            for (auto bc : axis.bounds_changed_nets) {
-                new_net_bounds.at(bc) = net_bounds.at(bc);
-                axis.already_bounds_changed[bc] = NO_CHANGE;
-            }
-            axis.bounds_changed_nets.clear();
-        }
-        for (auto &arc : timing_changed_arcs) {
-            already_timing_changed.at(arc.first).at(arc.second.idx()) = false;
-        }
-        timing_changed_arcs.clear();
-        new_timing_costs.clear();
-        wirelen_delta = 0;
-        timing_delta = 0;
-    }
-
     bool accept_move()
     {
         static constexpr double epsilon = 1e-20;
@@ -553,19 +61,6 @@ struct ThreadState
                (g.temperature > 1e-8 && (rng.rng() / float(0x3fffffff)) <= std::exp(-delta / g.temperature));
     }
 
-    bool add_to_move(CellInfo *cell, BelId old_bel, BelId new_bel)
-    {
-        if (!bounds_check(old_bel) || !bounds_check(new_bel))
-            return false;
-        if (!ctx->isValidBelForCellType(cell->type, new_bel))
-            return false;
-        NPNR_ASSERT(!moved_cells.count(cell->name));
-        moved_cells[cell->name] = std::make_pair(old_bel, new_bel);
-        local_cell2bel[cell->name] = new_bel;
-        compute_changes_for_cell(cell, old_bel, new_bel);
-        return true;
-    }
-
     bool single_cell_swap(CellInfo *cell, BelId new_bel)
     {
         NPNR_ASSERT(moved_cells.empty());
@@ -806,14 +301,14 @@ struct ParallelRefine
             g.bels.addCellType(cell_type);
         }
     };
-    std::vector<Partition> parts;
+    std::vector<PlacePartition> parts;
     void do_partition()
     {
         parts.clear();
         parts.emplace_back(ctx);
         bool yaxis = false;
         while (parts.size() < t.size()) {
-            std::vector<Partition> next(parts.size() * 2);
+            std::vector<PlacePartition> next(parts.size() * 2);
             for (size_t i = 0; i < parts.size(); i++) {
                 // Randomly permute pivot every iteration so we get different thread boundaries
                 const float delta = 0.1;
@@ -834,28 +329,6 @@ struct ParallelRefine
             w.join();
     }
 
-    void update_global_costs()
-    {
-        g.last_bounds.resize(g.flat_nets.size());
-        g.last_tmg_costs.resize(g.flat_nets.size());
-        g.total_wirelen = 0;
-        g.total_timing_cost = 0;
-        for (size_t i = 0; i < g.flat_nets.size(); i++) {
-            NetInfo *ni = g.flat_nets.at(i);
-            if (g.skip_net(ni))
-                continue;
-            g.last_bounds.at(i) = NetBB::compute(ctx, ni);
-            g.total_wirelen += g.last_bounds.at(i).hpwl(g.cfg);
-            if (!g.timing_skip_net(ni)) {
-                auto &tc = g.last_tmg_costs.at(i);
-                tc.resize(ni->users.capacity());
-                for (auto usr : ni->users.enumerate()) {
-                    tc.at(usr.index.idx()) = g.get_timing_cost(ni, usr.index);
-                    g.total_timing_cost += tc.at(usr.index.idx());
-                }
-            }
-        }
-    }
     void run()
     {
 
@@ -868,7 +341,7 @@ struct ParallelRefine
         log_info("Running parallel refinement with %d threads.\n", int(t.size()));
         int iter = 1;
         bool done = false;
-        update_global_costs();
+        g.update_global_costs();
         double avg_wirelen = g.total_wirelen;
         wirelen_t min_wirelen = g.total_wirelen;
         while (true) {
@@ -914,7 +387,7 @@ struct ParallelRefine
             for (auto &w : workers)
                 w.join();
             g.tmg.run();
-            update_global_costs();
+            g.update_global_costs();
             iter++;
             ctx->yield();
         }
@@ -924,17 +397,14 @@ struct ParallelRefine
 };
 } // namespace
 
-ParallelRefineCfg::ParallelRefineCfg(Context *ctx)
+ParallelRefineCfg::ParallelRefineCfg(Context *ctx) : DetailPlaceCfg(ctx)
 {
-    timing_driven = ctx->setting<bool>("timing_driven");
     threads = ctx->setting<int>("threads", 8);
     // snap to nearest power of two; and minimum thread size
     int actual_threads = 1;
     while ((actual_threads * 2) <= threads && (int(ctx->cells.size()) / (actual_threads * 2)) >= min_thread_size)
         actual_threads *= 2;
     threads = actual_threads;
-    hpwl_scale_x = 1;
-    hpwl_scale_y = 1;
 }
 
 bool parallel_refine(Context *ctx, ParallelRefineCfg cfg)
diff --git a/common/place/parallel_refine.h b/common/place/parallel_refine.h
index 556317cd..a6c8b469 100644
--- a/common/place/parallel_refine.h
+++ b/common/place/parallel_refine.h
@@ -20,18 +20,16 @@
 #ifndef PARALLEL_REFINE_H
 #define PARALLEL_REFINE_H
 
+#include "detail_place_cfg.h"
 #include "nextpnr.h"
 
 NEXTPNR_NAMESPACE_BEGIN
 
-struct ParallelRefineCfg
+struct ParallelRefineCfg : DetailPlaceCfg
 {
     ParallelRefineCfg(Context *ctx);
-    bool timing_driven;
     int threads;
-    int hpwl_scale_x, hpwl_scale_y;
     double lambda = 0.5f;
-    float crit_exp = 8;
     int inner_iters = 15;
     int min_thread_size = 500;
 };