From 006d6d95a003be26df5b34be959f29725ded0bb6 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 11 Mar 2021 19:08:00 +0000
Subject: [PATCH] media: rpivid: Add a Pass0 to accumulate slices and
 rework job finish

Due to overheads in assembling controls and requests, it is worth having
the slice assembly phase separate from the h/w pass1 processing. Create
a queue to service pass1 rather than having the pass1 finished callback
trigger the next slice job.
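
Roughly, the queuing works like the userspace sketch below (illustrative
only - claim_queue, p1_claim and p1_complete are invented names for this
example, not driver functions): pass1 claims are gated by a credit count
sized to the number of buffer sets available to the later phases, and
each completion hands a credit back, releasing the next queued slice job.

	#include <stdio.h>

	#define P2BUF_COUNT 3

	struct claim_queue {
		int enable;	/* credits: pass1 jobs allowed to run now */
		int queued;	/* assembled jobs waiting for a credit */
	};

	static void p1_dispatch(void)
	{
		printf("dispatch next pass1 job\n");
	}

	/* Called by pass0 once a slice job has been assembled */
	static void p1_claim(struct claim_queue *q)
	{
		if (q->enable > 0) {
			q->enable--;
			p1_dispatch();
		} else {
			q->queued++;	/* wait for a completion */
		}
	}

	/* Called when the h/w has finished with one buffer set */
	static void p1_complete(struct claim_queue *q)
	{
		if (q->queued > 0) {
			q->queued--;
			p1_dispatch();	/* credit goes straight to a waiter */
		} else {
			q->enable++;
		}
	}

	int main(void)
	{
		struct claim_queue q = { .enable = P2BUF_COUNT };
		int i;

		for (i = 0; i < 5; i++)
			p1_claim(&q);	/* the last two jobs queue up */
		p1_complete(&q);	/* releases one queued job */
		return 0;
	}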

This requires a rework of the logic that splits up the buffer and
request done events. This code contains two ways of doing that; we use
Ezequiel Garcia's <ezequiel@collabora.com> solution, but expect that
in the future this will be handled by the framework in a cleaner manner.
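
As a rough userspace model of the chosen split (again illustrative only,
with invented names): finishing the m2m job, which lets the next slice
request be scheduled, is decoupled from marking the capture buffer done,
which only happens once the final hardware phase has written the frame.

	#include <stdbool.h>
	#include <stdio.h>

	struct slice_job {
		int id;
		bool last_slice_of_frame;
	};

	/* Pass1 done: the scheduler may start the next slice job, but the
	 * capture buffer stays held until the frame is complete.
	 */
	static void job_finish(const struct slice_job *j)
	{
		printf("job %d finished, next slice may be scheduled\n", j->id);
	}

	/* Final pass done: only now is the capture buffer returned */
	static void capture_buf_done(const struct slice_job *j)
	{
		if (j->last_slice_of_frame)
			printf("job %d: capture buffer done\n", j->id);
	}

	int main(void)
	{
		struct slice_job a = { .id = 0, .last_slice_of_frame = false };
		struct slice_job b = { .id = 1, .last_slice_of_frame = true };

		job_finish(&a);
		job_finish(&b);		/* last slice of the frame */
		capture_buf_done(&b);	/* after the final h/w phase */
		return 0;
	}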

Fix up the handling of some of the memory exhaustion crashes uncovered
in the process of writing this code.

Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 drivers/media/v4l2-core/v4l2-mem2mem.c     |   2 -
 drivers/staging/media/rpivid/rpivid.c      |  11 +-
 drivers/staging/media/rpivid/rpivid.h      |  20 +-
 drivers/staging/media/rpivid/rpivid_dec.c  |  32 +-
 drivers/staging/media/rpivid/rpivid_h265.c | 432 ++++++++++++++++-----
 drivers/staging/media/rpivid/rpivid_hw.c   |   8 +-
 6 files changed, 374 insertions(+), 131 deletions(-)

--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -492,8 +492,6 @@ void v4l2_m2m_job_finish(struct v4l2_m2m
 	 * holding capture buffers. Those should use
 	 * v4l2_m2m_buf_done_and_job_finish() instead.
 	 */
-	WARN_ON(m2m_ctx->out_q_ctx.q.subsystem_flags &
-		VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF);
 	spin_lock_irqsave(&m2m_dev->job_spinlock, flags);
 	schedule_next = _v4l2_m2m_job_finish(m2m_dev, m2m_ctx);
 	spin_unlock_irqrestore(&m2m_dev->job_spinlock, flags);
--- a/drivers/staging/media/rpivid/rpivid.c
+++ b/drivers/staging/media/rpivid/rpivid.c
@@ -79,17 +79,24 @@ static const struct rpivid_control rpivi
 
 #define rpivid_ctrls_COUNT	ARRAY_SIZE(rpivid_ctrls)
 
-void *rpivid_find_control_data(struct rpivid_ctx *ctx, u32 id)
+struct v4l2_ctrl *rpivid_find_ctrl(struct rpivid_ctx *ctx, u32 id)
 {
 	unsigned int i;
 
 	for (i = 0; ctx->ctrls[i]; i++)
 		if (ctx->ctrls[i]->id == id)
-			return ctx->ctrls[i]->p_cur.p;
+			return ctx->ctrls[i];
 
 	return NULL;
 }
 
+void *rpivid_find_control_data(struct rpivid_ctx *ctx, u32 id)
+{
+	struct v4l2_ctrl *const ctrl = rpivid_find_ctrl(ctx, id);
+
+	return !ctrl ? NULL : ctrl->p_cur.p;
+}
+
 static int rpivid_init_ctrls(struct rpivid_dev *dev, struct rpivid_ctx *ctx)
 {
 	struct v4l2_ctrl_handler *hdl = &ctx->hdl;
--- a/drivers/staging/media/rpivid/rpivid.h
+++ b/drivers/staging/media/rpivid/rpivid.h
@@ -24,6 +24,10 @@
 
 #define OPT_DEBUG_POLL_IRQ  0
 
+#define RPIVID_DEC_ENV_COUNT 6
+#define RPIVID_P1BUF_COUNT 3
+#define RPIVID_P2BUF_COUNT 3
+
 #define RPIVID_NAME			"rpivid"
 
 #define RPIVID_CAPABILITY_UNTILED	BIT(0)
@@ -45,6 +49,7 @@ struct rpivid_control {
 };
 
 struct rpivid_h265_run {
+	u32 slice_ents;
 	const struct v4l2_ctrl_hevc_sps			*sps;
 	const struct v4l2_ctrl_hevc_pps			*pps;
 	const struct v4l2_ctrl_hevc_slice_params	*slice_params;
@@ -64,7 +69,6 @@ struct rpivid_buffer {
 
 struct rpivid_dec_state;
 struct rpivid_dec_env;
-#define RPIVID_DEC_ENV_COUNT 3
 
 struct rpivid_gptr {
 	size_t size;
@@ -79,7 +83,6 @@ typedef void (*rpivid_irq_callback)(stru
 struct rpivid_q_aux;
 #define RPIVID_AUX_ENT_COUNT VB2_MAX_FRAME
 
-#define RPIVID_P2BUF_COUNT 2
 
 struct rpivid_ctx {
 	struct v4l2_fh			fh;
@@ -108,11 +111,13 @@ struct rpivid_ctx {
 
 	struct rpivid_dec_env *dec_pool;
 
-	/* Some of these should be in dev */
-	struct rpivid_gptr bitbufs[1];  /* Will be 2 */
-	struct rpivid_gptr cmdbufs[1];  /* Will be 2 */
+	unsigned int p1idx;
+	atomic_t p1out;
+	struct rpivid_gptr bitbufs[RPIVID_P1BUF_COUNT];
+	struct rpivid_gptr cmdbufs[RPIVID_P1BUF_COUNT];
+
+	/* *** Should be in dev *** */
 	unsigned int p2idx;
-	atomic_t p2out;
 	struct rpivid_gptr pu_bufs[RPIVID_P2BUF_COUNT];
 	struct rpivid_gptr coeff_bufs[RPIVID_P2BUF_COUNT];
 
@@ -141,6 +146,8 @@ struct rpivid_variant {
 
 struct rpivid_hw_irq_ent;
 
+#define RPIVID_ICTL_ENABLE_UNLIMITED (-1)
+
 struct rpivid_hw_irq_ctrl {
 	/* Spinlock protecting claim and tail */
 	spinlock_t lock;
@@ -182,6 +189,7 @@ struct rpivid_dev {
 
 extern struct rpivid_dec_ops rpivid_dec_ops_h265;
 
+struct v4l2_ctrl *rpivid_find_ctrl(struct rpivid_ctx *ctx, u32 id);
 void *rpivid_find_control_data(struct rpivid_ctx *ctx, u32 id);
 
 #endif
--- a/drivers/staging/media/rpivid/rpivid_dec.c
+++ b/drivers/staging/media/rpivid/rpivid_dec.c
@@ -21,8 +21,8 @@
 
 void rpivid_device_run(void *priv)
 {
-	struct rpivid_ctx *ctx = priv;
-	struct rpivid_dev *dev = ctx->dev;
+	struct rpivid_ctx *const ctx = priv;
+	struct rpivid_dev *const dev = ctx->dev;
 	struct rpivid_run run = {};
 	struct media_request *src_req;
 
@@ -32,19 +32,17 @@ void rpivid_device_run(void *priv)
 	if (!run.src || !run.dst) {
 		v4l2_err(&dev->v4l2_dev, "%s: Missing buffer: src=%p, dst=%p\n",
 			 __func__, run.src, run.dst);
-		/* We are stuffed - this probably won't dig us out of our
-		 * current situation but it is better than nothing
-		 */
-		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-						 VB2_BUF_STATE_ERROR);
-		return;
+		goto fail;
 	}
 
-	/* Apply request(s) controls if needed. */
+	/* Apply request(s) controls */
 	src_req = run.src->vb2_buf.req_obj.req;
+	if (!src_req) {
+		v4l2_err(&dev->v4l2_dev, "%s: Missing request\n", __func__);
+		goto fail;
+	}
 
-	if (src_req)
-		v4l2_ctrl_request_setup(src_req, &ctx->hdl);
+	v4l2_ctrl_request_setup(src_req, &ctx->hdl);
 
 	switch (ctx->src_fmt.pixelformat) {
 	case V4L2_PIX_FMT_HEVC_SLICE:
@@ -70,10 +68,14 @@ void rpivid_device_run(void *priv)
 
 	dev->dec_ops->setup(ctx, &run);
 
-	/* Complete request(s) controls if needed. */
-
-	if (src_req)
-		v4l2_ctrl_request_complete(src_req, &ctx->hdl);
+	/* Complete request(s) controls */
+	v4l2_ctrl_request_complete(src_req, &ctx->hdl);
 
 	dev->dec_ops->trigger(ctx);
+	return;
+
+fail:
+	/* We really shouldn't get here but tidy up what we can */
+	v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
+					 VB2_BUF_STATE_ERROR);
 }
--- a/drivers/staging/media/rpivid/rpivid_h265.c
+++ b/drivers/staging/media/rpivid/rpivid_h265.c
@@ -22,6 +22,8 @@
 #define DEBUG_TRACE_P1_CMD 0
 #define DEBUG_TRACE_EXECUTION 0
 
+#define USE_REQUEST_PIN 1
+
 #if DEBUG_TRACE_EXECUTION
 #define xtrace_in(dev_, de_)\
 	v4l2_info(&(dev_)->v4l2_dev, "%s[%d]: in\n",   __func__,\
@@ -192,8 +194,6 @@ struct rpivid_dec_env {
 	unsigned int decode_order;
 	int p1_status;		/* P1 status - what to realloc */
 
-	struct rpivid_dec_env *phase_wait_q_next;
-
 	struct rpi_cmd *cmd_fifo;
 	unsigned int cmd_len, cmd_max;
 	unsigned int num_slice_msgs;
@@ -219,6 +219,7 @@ struct rpivid_dec_env {
 	u32 rpi_currpoc;
 
 	struct vb2_v4l2_buffer *frame_buf; // Detached dest buffer
+	struct vb2_v4l2_buffer *src_buf;   // Detached src buffer
 	unsigned int frame_c_offset;
 	unsigned int frame_stride;
 	dma_addr_t frame_addr;
@@ -235,9 +236,15 @@ struct rpivid_dec_env {
 	size_t bit_copy_len;
 	struct rpivid_gptr *cmd_copy_gptr;
 
-	u16 slice_msgs[2 * HEVC_MAX_REFS * 8 + 3];
+#define SLICE_MSGS_MAX (2 * HEVC_MAX_REFS * 8 + 3)
+	u16 slice_msgs[SLICE_MSGS_MAX];
 	u8 scaling_factors[NUM_SCALING_FACTORS];
 
+#if USE_REQUEST_PIN
+	struct media_request *req_pin;
+#else
+	struct media_request_object *req_obj;
+#endif
 	struct rpivid_hw_irq_ent irq_ent;
 };
 
@@ -286,6 +293,17 @@ struct rpivid_dec_state {
 	unsigned int prev_ctb_y;
 };
 
+#if !USE_REQUEST_PIN
+static void dst_req_obj_release(struct media_request_object *object)
+{
+	kfree(object);
+}
+
+static const struct media_request_object_ops dst_req_obj_ops = {
+	.release = dst_req_obj_release,
+};
+#endif
+
 static inline int clip_int(const int x, const int lo, const int hi)
 {
 	return x < lo ? lo : x > hi ? hi : x;
@@ -298,15 +316,48 @@ static inline int clip_int(const int x,
 static int p1_z;
 #endif
 
+static int cmds_check_space(struct rpivid_dec_env *const de, unsigned int n)
+{
+	struct rpi_cmd *a;
+	unsigned int newmax;
+
+	if (n > 0x100000) {
+		v4l2_err(&de->ctx->dev->v4l2_dev,
+			 "%s: n %u implausible\n", __func__, n);
+		return -ENOMEM;
+	}
+
+	if (de->cmd_len + n <= de->cmd_max)
+		return 0;
+
+	newmax = 2 << log2_size(de->cmd_len + n);
+
+	a = krealloc(de->cmd_fifo, newmax * sizeof(struct rpi_cmd),
+		     GFP_KERNEL);
+	if (!a) {
+		v4l2_err(&de->ctx->dev->v4l2_dev,
+			 "Failed cmd buffer realloc from %u to %u\n",
+			 de->cmd_max, newmax);
+		return -ENOMEM;
+	}
+	v4l2_info(&de->ctx->dev->v4l2_dev,
+		  "cmd buffer realloc from %u to %u\n", de->cmd_max, newmax);
+
+	de->cmd_fifo = a;
+	de->cmd_max = newmax;
+	return 0;
+}
+
 // ???? u16 addr - put in u32
-static int p1_apb_write(struct rpivid_dec_env *const de, const u16 addr,
-			const u32 data)
+static void p1_apb_write(struct rpivid_dec_env *const de, const u16 addr,
+			 const u32 data)
 {
-	if (de->cmd_len == de->cmd_max)
-		de->cmd_fifo =
-			krealloc(de->cmd_fifo,
-				 (de->cmd_max *= 2) * sizeof(struct rpi_cmd),
-				 GFP_KERNEL);
+	if (de->cmd_len >= de->cmd_max) {
+		v4l2_err(&de->ctx->dev->v4l2_dev,
+			 "%s: Overflow @ %d\n", __func__, de->cmd_len);
+		return;
+	}
+
 	de->cmd_fifo[de->cmd_len].addr = addr;
 	de->cmd_fifo[de->cmd_len].data = data;
 
@@ -316,8 +367,7 @@ static int p1_apb_write(struct rpivid_de
 			  de->cmd_len, addr, data);
 	}
 #endif
-
-	return de->cmd_len++;
+	de->cmd_len++;
 }
 
 static int ctb_to_tile(unsigned int ctb, unsigned int *bd, int num)
@@ -511,6 +561,7 @@ static const u8 prob_init[3][156] = {
 	},
 };
 
+#define CMDS_WRITE_PROB ((RPI_PROB_ARRAY_SIZE / 4) + 1)
 static void write_prob(struct rpivid_dec_env *const de,
 		       const struct rpivid_dec_state *const s)
 {
@@ -554,6 +605,7 @@ static void write_prob(struct rpivid_dec
 	p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
 }
 
+#define CMDS_WRITE_SCALING_FACTORS NUM_SCALING_FACTORS
 static void write_scaling_factors(struct rpivid_dec_env *const de)
 {
 	int i;
@@ -569,8 +621,9 @@ static inline __u32 dma_to_axi_addr(dma_
 	return (__u32)(a >> 6);
 }
 
-static void write_bitstream(struct rpivid_dec_env *const de,
-			    const struct rpivid_dec_state *const s)
+#define CMDS_WRITE_BITSTREAM 4
+static int write_bitstream(struct rpivid_dec_env *const de,
+			   const struct rpivid_dec_state *const s)
 {
 	// Note that FFmpeg V4L2 does not remove emulation prevention bytes,
 	// so this is matched in the configuration here.
@@ -584,6 +637,13 @@ static void write_bitstream(struct rpivi
 	if (s->src_addr != 0) {
 		addr = s->src_addr + offset;
 	} else {
+		if (len + de->bit_copy_len > de->bit_copy_gptr->size) {
+			v4l2_warn(&de->ctx->dev->v4l2_dev,
+				  "Bit copy buffer overflow: size=%zu, offset=%zu, len=%u\n",
+				  de->bit_copy_gptr->size,
+				  de->bit_copy_len, len);
+			return -ENOMEM;
+		}
 		memcpy(de->bit_copy_gptr->ptr + de->bit_copy_len,
 		       s->src_buf + offset, len);
 		addr = de->bit_copy_gptr->addr + de->bit_copy_len;
@@ -595,6 +655,7 @@ static void write_bitstream(struct rpivi
 	p1_apb_write(de, RPI_BFNUM, len);
 	p1_apb_write(de, RPI_BFCONTROL, offset + (1 << 7)); // Stop
 	p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu << 6));
+	return 0;
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -623,6 +684,7 @@ static u32 slice_reg_const(const struct
 
 //////////////////////////////////////////////////////////////////////////////
 
+#define CMDS_NEW_SLICE_SEGMENT (4 + CMDS_WRITE_SCALING_FACTORS)
 static void new_slice_segment(struct rpivid_dec_env *const de,
 			      const struct rpivid_dec_state *const s)
 {
@@ -706,6 +768,7 @@ static void msg_slice(struct rpivid_dec_
 	de->slice_msgs[de->num_slice_msgs++] = msg;
 }
 
+#define CMDS_PROGRAM_SLICECMDS (1 + SLICE_MSGS_MAX)
 static void program_slicecmds(struct rpivid_dec_env *const de,
 			      const int sliceid)
 {
@@ -902,6 +965,7 @@ static void pre_slice_decode(struct rpiv
 		       (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF
 }
 
+#define CMDS_WRITE_SLICE 1
 static void write_slice(struct rpivid_dec_env *const de,
 			const struct rpivid_dec_state *const s,
 			const u32 slice_const,
@@ -927,6 +991,7 @@ static void write_slice(struct rpivid_de
  * N.B. This can be called to fill in data from the previous slice so must not
  * use any state data that may change from slice to slice (e.g. qp)
  */
+#define CMDS_NEW_ENTRY_POINT (6 + CMDS_WRITE_SLICE)
 static void new_entry_point(struct rpivid_dec_env *const de,
 			    const struct rpivid_dec_state *const s,
 			    const bool do_bte,
@@ -977,6 +1042,7 @@ static void new_entry_point(struct rpivi
 //////////////////////////////////////////////////////////////////////////////
 // Wavefront mode
 
+#define CMDS_WPP_PAUSE 4
 static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row)
 {
 	p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25);
@@ -987,12 +1053,19 @@ static void wpp_pause(struct rpivid_dec_
 	p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
 }
 
-static void wpp_entry_fill(struct rpivid_dec_env *const de,
-			   const struct rpivid_dec_state *const s,
-			   const unsigned int last_y)
+#define CMDS_WPP_ENTRY_FILL_1 (CMDS_WPP_PAUSE + 2 + CMDS_NEW_ENTRY_POINT)
+static int wpp_entry_fill(struct rpivid_dec_env *const de,
+			  const struct rpivid_dec_state *const s,
+			  const unsigned int last_y)
 {
+	int rv;
 	const unsigned int last_x = s->ctb_width - 1;
 
+	rv = cmds_check_space(de, CMDS_WPP_ENTRY_FILL_1 *
+				  (last_y - de->entry_ctb_y));
+	if (rv)
+		return rv;
+
 	while (de->entry_ctb_y < last_y) {
 		/* wpp_entry_x/y set by wpp_entry_point */
 		if (s->ctb_width > 2)
@@ -1010,12 +1083,21 @@ static void wpp_entry_fill(struct rpivid
 				0, 0, 0, de->entry_ctb_y + 1,
 				de->entry_qp, de->entry_slice);
 	}
+	return 0;
 }
 
-static void wpp_end_previous_slice(struct rpivid_dec_env *const de,
-				   const struct rpivid_dec_state *const s)
+static int wpp_end_previous_slice(struct rpivid_dec_env *const de,
+				  const struct rpivid_dec_state *const s)
 {
-	wpp_entry_fill(de, s, s->prev_ctb_y);
+	int rv;
+
+	rv = wpp_entry_fill(de, s, s->prev_ctb_y);
+	if (rv)
+		return rv;
+
+	rv = cmds_check_space(de, CMDS_WPP_PAUSE + 2);
+	if (rv)
+		return rv;
 
 	if (de->entry_ctb_x < 2 &&
 	    (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) &&
@@ -1026,21 +1108,38 @@ static void wpp_end_previous_slice(struc
 	if (s->start_ctb_x == 2 ||
 	    (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y))
 		p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
+	return 0;
 }
 
 /* Only main profile supported so WPP => !Tiles which makes some of the
  * next chunk code simpler
  */
-static void wpp_decode_slice(struct rpivid_dec_env *const de,
-			     const struct rpivid_dec_state *const s)
+static int wpp_decode_slice(struct rpivid_dec_env *const de,
+			    const struct rpivid_dec_state *const s)
 {
 	bool reset_qp_y = true;
 	const bool indep = !s->dependent_slice_segment_flag;
+	int rv;
 
-	if (s->start_ts)
-		wpp_end_previous_slice(de, s);
+	if (s->start_ts) {
+		rv = wpp_end_previous_slice(de, s);
+		if (rv)
+			return rv;
+	}
 	pre_slice_decode(de, s);
-	write_bitstream(de, s);
+
+	rv = cmds_check_space(de,
+			      CMDS_WRITE_BITSTREAM +
+				CMDS_WRITE_PROB +
+				CMDS_PROGRAM_SLICECMDS +
+				CMDS_NEW_SLICE_SEGMENT +
+				CMDS_NEW_ENTRY_POINT);
+	if (rv)
+		return rv;
+
+	rv = write_bitstream(de, s);
+	if (rv)
+		return rv;
 
 	if (!s->start_ts || indep || s->ctb_width == 1)
 		write_prob(de, s);
@@ -1056,7 +1155,13 @@ static void wpp_decode_slice(struct rpiv
 			s->slice_qp, slice_reg_const(s));
 
 	if (s->frame_end) {
-		wpp_entry_fill(de, s, s->ctb_height - 1);
+		rv = wpp_entry_fill(de, s, s->ctb_height - 1);
+		if (rv)
+			return rv;
+
+		rv = cmds_check_space(de, CMDS_WPP_PAUSE + 1);
+		if (rv)
+			return rv;
 
 		if (de->entry_ctb_x < 2 && s->ctb_width > 2)
 			wpp_pause(de, s->ctb_height - 1);
@@ -1065,25 +1170,32 @@ static void wpp_decode_slice(struct rpiv
 			     1 | ((s->ctb_width - 1) << 5) |
 				((s->ctb_height - 1) << 18));
 	}
-
+	return 0;
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Tiles mode
 
-static void tile_entry_fill(struct rpivid_dec_env *const de,
-			    const struct rpivid_dec_state *const s,
-			    const unsigned int last_tile_x,
-			    const unsigned int last_tile_y)
+// Guarantees 1 cmd entry free on exit
+static int tile_entry_fill(struct rpivid_dec_env *const de,
+			   const struct rpivid_dec_state *const s,
+			   const unsigned int last_tile_x,
+			   const unsigned int last_tile_y)
 {
 	while (de->entry_tile_y < last_tile_y ||
 	       (de->entry_tile_y == last_tile_y &&
 		de->entry_tile_x < last_tile_x)) {
+		int rv;
 		unsigned int t_x = de->entry_tile_x;
 		unsigned int t_y = de->entry_tile_y;
 		const unsigned int last_x = s->col_bd[t_x + 1] - 1;
 		const unsigned int last_y = s->row_bd[t_y + 1] - 1;
 
+		// One more than needed here
+		rv = cmds_check_space(de, CMDS_NEW_ENTRY_POINT + 3);
+		if (rv)
+			return rv;
+
 		p1_apb_write(de, RPI_STATUS,
 			     2 | (last_x << 5) | (last_y << 18));
 		p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
@@ -1098,33 +1210,55 @@ static void tile_entry_fill(struct rpivi
 				t_x, t_y, s->col_bd[t_x], s->row_bd[t_y],
 				de->entry_qp, de->entry_slice);
 	}
+	return 0;
 }
 
 /*
  * Write STATUS register with expected end CTU address of previous slice
  */
-static void end_previous_slice(struct rpivid_dec_env *const de,
-			       const struct rpivid_dec_state *const s)
+static int end_previous_slice(struct rpivid_dec_env *const de,
+			      const struct rpivid_dec_state *const s)
 {
-	tile_entry_fill(de, s,
-			ctb_to_tile_x(s, s->prev_ctb_x),
-			ctb_to_tile_y(s, s->prev_ctb_y));
+	int rv;
+
+	rv = tile_entry_fill(de, s,
+			     ctb_to_tile_x(s, s->prev_ctb_x),
+			     ctb_to_tile_y(s, s->prev_ctb_y));
+	if (rv)
+		return rv;
+
 	p1_apb_write(de, RPI_STATUS,
 		     1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
+	return 0;
 }
 
-static void decode_slice(struct rpivid_dec_env *const de,
-			 const struct rpivid_dec_state *const s)
+static int decode_slice(struct rpivid_dec_env *const de,
+			const struct rpivid_dec_state *const s)
 {
 	bool reset_qp_y;
 	unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x);
 	unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y);
+	int rv;
 
-	if (s->start_ts)
-		end_previous_slice(de, s);
+	if (s->start_ts) {
+		rv = end_previous_slice(de, s);
+		if (rv)
+			return rv;
+	}
+
+	rv = cmds_check_space(de,
+			      CMDS_WRITE_BITSTREAM +
+				CMDS_WRITE_PROB +
+				CMDS_PROGRAM_SLICECMDS +
+				CMDS_NEW_SLICE_SEGMENT +
+				CMDS_NEW_ENTRY_POINT);
+	if (rv)
+		return rv;
 
 	pre_slice_decode(de, s);
-	write_bitstream(de, s);
+	rv = write_bitstream(de, s);
+	if (rv)
+		return rv;
 
 	reset_qp_y = !s->start_ts ||
 		!s->dependent_slice_segment_flag ||
@@ -1146,13 +1280,16 @@ static void decode_slice(struct rpivid_d
 	 * when it will be known where this slice finishes
 	 */
 	if (s->frame_end) {
-		tile_entry_fill(de, s,
-				s->tile_width - 1,
-				s->tile_height - 1);
+		rv = tile_entry_fill(de, s,
+				     s->tile_width - 1,
+				     s->tile_height - 1);
+		if (rv)
+			return rv;
 		p1_apb_write(de, RPI_STATUS,
 			     1 | ((s->ctb_width - 1) << 5) |
 				((s->ctb_height - 1) << 18));
 	}
+	return 0;
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1524,7 +1661,7 @@ static void rpivid_h265_setup(struct rpi
 	struct rpivid_dev *const dev = ctx->dev;
 	const struct v4l2_ctrl_hevc_slice_params *const sh =
 						run->h265.slice_params;
-	const struct v4l2_hevc_pred_weight_table *pred_weight_table;
+//	const struct v4l2_hevc_pred_weight_table *pred_weight_table;
 	struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	struct rpivid_dec_state *const s = ctx->state;
 	struct vb2_queue *vq;
@@ -1532,11 +1669,12 @@ static void rpivid_h265_setup(struct rpi
 	unsigned int prev_rs;
 	unsigned int i;
 	int use_aux;
+	int rv;
 	bool slice_temporal_mvp;
 
 	xtrace_in(dev, de);
 
-	pred_weight_table = &sh->pred_weight_table;
+//	pred_weight_table = &sh->pred_weight_table;
 
 	s->frame_end =
 		((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0);
@@ -1608,9 +1746,9 @@ static void rpivid_h265_setup(struct rpi
 		de->cmd_len = 0;
 		de->dpbno_col = ~0U;
 
-		de->bit_copy_gptr = ctx->bitbufs + 0;
+		de->bit_copy_gptr = ctx->bitbufs + ctx->p1idx;
 		de->bit_copy_len = 0;
-		de->cmd_copy_gptr = ctx->cmdbufs + 0;
+		de->cmd_copy_gptr = ctx->cmdbufs + ctx->p1idx;
 
 		de->frame_c_offset = ctx->dst_fmt.height * 128;
 		de->frame_stride = ctx->dst_fmt.plane_fmt[0].bytesperline * 128;
@@ -1727,6 +1865,9 @@ static void rpivid_h265_setup(struct rpi
 			bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
 				wxh < 983040 * 2 ? 983040 * 3 / 4 :
 				wxh * 3 / 8;
+			/* Allow for bit depth */
+			bits_alloc += (bits_alloc *
+				       s->sps.bit_depth_luma_minus8) / 8;
 			bits_alloc = round_up_size(bits_alloc);
 
 			if (gptr_alloc(dev, de->bit_copy_gptr,
@@ -1743,18 +1884,35 @@ static void rpivid_h265_setup(struct rpi
 		}
 	}
 
-	// Pre calc a few things
-	s->src_addr =
-		!s->frame_end ?
-			0 :
-			vb2_dma_contig_plane_dma_addr(&run->src->vb2_buf, 0);
-	s->src_buf = s->src_addr != 0 ? NULL :
-					vb2_plane_vaddr(&run->src->vb2_buf, 0);
+	// Either map src buffer or use directly
+	s->src_addr = 0;
+	s->src_buf = NULL;
+
+	if (run->src->planes[0].bytesused < (sh->bit_size + 7) / 8) {
+		v4l2_warn(&dev->v4l2_dev,
+			  "Bit size %d > bytesused %d\n",
+			  sh->bit_size, run->src->planes[0].bytesused);
+		goto fail;
+	}
+	if (sh->data_bit_offset >= sh->bit_size ||
+	    sh->bit_size - sh->data_bit_offset < 8) {
+		v4l2_warn(&dev->v4l2_dev,
+			  "Bit size %d < Bit offset %d + 8\n",
+			  sh->bit_size, sh->data_bit_offset);
+		goto fail;
+	}
+
+	if (s->frame_end)
+		s->src_addr = vb2_dma_contig_plane_dma_addr(&run->src->vb2_buf,
+							    0);
+	if (!s->src_addr)
+		s->src_buf = vb2_plane_vaddr(&run->src->vb2_buf, 0);
 	if (!s->src_addr && !s->src_buf) {
 		v4l2_err(&dev->v4l2_dev, "Failed to map src buffer\n");
 		goto fail;
 	}
 
+	// Pre calc a few things
 	s->sh = sh;
 	s->slice_qp = 26 + s->pps.init_qp_minus26 + s->sh->slice_qp_delta;
 	s->max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ?
@@ -1785,9 +1943,11 @@ static void rpivid_h265_setup(struct rpi
 	s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y;
 
 	if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED))
-		wpp_decode_slice(de, s);
+		rv = wpp_decode_slice(de, s);
 	else
-		decode_slice(de, s);
+		rv = decode_slice(de, s);
+	if (rv)
+		goto fail;
 
 	if (!s->frame_end) {
 		xtrace_ok(dev, de);
@@ -1945,29 +2105,28 @@ static int check_status(const struct rpi
 	return -1;
 }
 
-static void cb_phase2(struct rpivid_dev *const dev, void *v)
+static void phase2_cb(struct rpivid_dev *const dev, void *v)
 {
 	struct rpivid_dec_env *const de = v;
-	struct rpivid_ctx *const ctx = de->ctx;
 
 	xtrace_in(dev, de);
 
-	v4l2_m2m_cap_buf_return(dev->m2m_dev, ctx->fh.m2m_ctx, de->frame_buf,
-				VB2_BUF_STATE_DONE);
-	de->frame_buf = NULL;
+	/* Done with buffers - allow new P1 */
+	rpivid_hw_irq_active1_enable_claim(dev, 1);
 
-	/* Delete de before finish as finish might immediately trigger a reuse
-	 * of de
-	 */
-	dec_env_delete(de);
+	v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_DONE);
+	de->frame_buf = NULL;
 
-	if (atomic_add_return(-1, &ctx->p2out) >= RPIVID_P2BUF_COUNT - 1) {
-		xtrace_fin(dev, de);
-		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-						 VB2_BUF_STATE_DONE);
-	}
+#if USE_REQUEST_PIN
+	media_request_unpin(de->req_pin);
+	de->req_pin = NULL;
+#else
+	media_request_object_complete(de->req_obj);
+	de->req_obj = NULL;
+#endif
 
 	xtrace_ok(dev, de);
+	dec_env_delete(de);
 }
 
 static void phase2_claimed(struct rpivid_dev *const dev, void *v)
@@ -2023,7 +2182,7 @@ static void phase2_claimed(struct rpivid
 	//	   de->ctx->colmvbuf.addr, de->ctx->colmvbuf.addr +
 	//	   de->ctx->colmvbuf.size);
 
-	rpivid_hw_irq_active2_irq(dev, &de->irq_ent, cb_phase2, de);
+	rpivid_hw_irq_active2_irq(dev, &de->irq_ent, phase2_cb, de);
 
 	apb_write_final(dev, RPI_NUMROWS, de->pic_height_in_ctbs_y);
 
@@ -2032,6 +2191,39 @@ static void phase2_claimed(struct rpivid
 
 static void phase1_claimed(struct rpivid_dev *const dev, void *v);
 
+// release any and all objects associated with de
+// and reenable phase 1 if required
+static void phase1_err_fin(struct rpivid_dev *const dev,
+			   struct rpivid_ctx *const ctx,
+			   struct rpivid_dec_env *const de)
+{
+	/* Return all detached buffers */
+	if (de->src_buf)
+		v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_ERROR);
+	de->src_buf = NULL;
+	if (de->frame_buf)
+		v4l2_m2m_buf_done(de->frame_buf, VB2_BUF_STATE_ERROR);
+	de->frame_buf = NULL;
+#if USE_REQUEST_PIN
+	if (de->req_pin)
+		media_request_unpin(de->req_pin);
+	de->req_pin = NULL;
+#else
+	if (de->req_obj)
+		media_request_object_complete(de->req_obj);
+	de->req_obj = NULL;
+#endif
+
+	dec_env_delete(de);
+
+	/* Reenable phase 0 if we were blocking */
+	if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1)
+		v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
+
+	/* Done with P1-P2 buffers - allow new P1 */
+	rpivid_hw_irq_active1_enable_claim(dev, 1);
+}
+
 static void phase1_thread(struct rpivid_dev *const dev, void *v)
 {
 	struct rpivid_dec_env *const de = v;
@@ -2076,15 +2268,12 @@ fail:
 			 __func__);
 		ctx->fatal_err = 1;
 	}
-	dec_env_delete(de);
-	xtrace_fin(dev, de);
-	v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-					 VB2_BUF_STATE_ERROR);
 	xtrace_fail(dev, de);
+	phase1_err_fin(dev, ctx, de);
 }
 
 /* Always called in irq context (this is good) */
-static void cb_phase1(struct rpivid_dev *const dev, void *v)
+static void phase1_cb(struct rpivid_dev *const dev, void *v)
 {
 	struct rpivid_dec_env *const de = v;
 	struct rpivid_ctx *const ctx = de->ctx;
@@ -2092,6 +2281,7 @@ static void cb_phase1(struct rpivid_dev
 	xtrace_in(dev, de);
 
 	de->p1_status = check_status(dev);
+
 	if (de->p1_status != 0) {
 		v4l2_info(&dev->v4l2_dev, "%s: Post wait: %#x\n",
 			  __func__, de->p1_status);
@@ -2105,24 +2295,17 @@ static void cb_phase1(struct rpivid_dev
 		return;
 	}
 
-	/* After the frame-buf is detached it must be returned but from
-	 * this point onward (phase2_claimed, cb_phase2) there are no error
-	 * paths so the return at the end of cb_phase2 is all that is needed
-	 */
-	de->frame_buf = v4l2_m2m_cap_buf_detach(dev->m2m_dev, ctx->fh.m2m_ctx);
-	if (!de->frame_buf) {
-		v4l2_err(&dev->v4l2_dev, "%s: No detached buffer\n", __func__);
-		goto fail;
-	}
+	v4l2_m2m_buf_done(de->src_buf, VB2_BUF_STATE_DONE);
+	de->src_buf = NULL;
 
+	/* All phase1 error paths done - it is safe to inc p2idx */
 	ctx->p2idx =
 		(ctx->p2idx + 1 >= RPIVID_P2BUF_COUNT) ? 0 : ctx->p2idx + 1;
 
-	// Enable the next setup if our Q isn't too big
-	if (atomic_add_return(1, &ctx->p2out) < RPIVID_P2BUF_COUNT) {
+	/* Re-enable the next setup if we were blocking */
+	if (atomic_add_return(-1, &ctx->p1out) >= RPIVID_P1BUF_COUNT - 1) {
 		xtrace_fin(dev, de);
-		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-						 VB2_BUF_STATE_DONE);
+		v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
 	}
 
 	rpivid_hw_irq_active2_claim(dev, &de->irq_ent, phase2_claimed, de);
@@ -2131,11 +2314,8 @@ static void cb_phase1(struct rpivid_dev
 	return;
 
 fail:
-	dec_env_delete(de);
-	xtrace_fin(dev, de);
-	v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-					 VB2_BUF_STATE_ERROR);
 	xtrace_fail(dev, de);
+	phase1_err_fin(dev, ctx, de);
 }
 
 static void phase1_claimed(struct rpivid_dev *const dev, void *v)
@@ -2160,6 +2340,10 @@ static void phase1_claimed(struct rpivid
 	de->coeff_stride =
 		ALIGN_DOWN(coeff_gptr->size / de->pic_height_in_ctbs_y, 64);
 
+	/* phase1_claimed is blocked until phase1_cb has completed, so p2idx
+	 * is only incremented in phase1_cb, after error detection
+	 */
+
 	apb_write_vc_addr(dev, RPI_PUWBASE, de->pu_base_vc);
 	apb_write_vc_len(dev, RPI_PUWSTRIDE, de->pu_stride);
 	apb_write_vc_addr(dev, RPI_COEFFWBASE, de->coeff_base_vc);
@@ -2169,7 +2353,7 @@ static void phase1_claimed(struct rpivid
 	apb_write(dev, RPI_CFNUM, de->cmd_len);
 
 	// Claim irq
-	rpivid_hw_irq_active1_irq(dev, &de->irq_ent, cb_phase1, de);
+	rpivid_hw_irq_active1_irq(dev, &de->irq_ent, phase1_cb, de);
 
 	// And start the h/w
 	apb_write_vc_addr_final(dev, RPI_CFBASE, de->cmd_copy_gptr->addr);
@@ -2178,11 +2362,8 @@ static void phase1_claimed(struct rpivid
 	return;
 
 fail:
-	dec_env_delete(de);
-	xtrace_fin(dev, de);
-	v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
-					 VB2_BUF_STATE_ERROR);
 	xtrace_fail(dev, de);
+	phase1_err_fin(dev, ctx, de);
 }
 
 static void dec_state_delete(struct rpivid_ctx *const ctx)
@@ -2315,7 +2496,9 @@ static void rpivid_h265_trigger(struct r
 	case RPIVID_DECODE_SLICE_CONTINUE:
 		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
 						 VB2_BUF_STATE_DONE);
+		xtrace_ok(dev, de);
 		break;
+
 	default:
 		v4l2_err(&dev->v4l2_dev, "%s: Unexpected state: %d\n", __func__,
 			 de->state);
@@ -2329,14 +2512,59 @@ static void rpivid_h265_trigger(struct r
 		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx,
 						 VB2_BUF_STATE_ERROR);
 		break;
+
 	case RPIVID_DECODE_PHASE1:
 		ctx->dec0 = NULL;
+
+#if !USE_REQUEST_PIN
+		/* Alloc a new request object - this must be allocated
+		 * dynamically as the media request will release it at an
+		 * arbitrary time after it is completed
+		 */
+		de->req_obj = kmalloc(sizeof(*de->req_obj), GFP_KERNEL);
+		if (!de->req_obj) {
+			xtrace_fail(dev, de);
+			dec_env_delete(de);
+			v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev,
+							 ctx->fh.m2m_ctx,
+							 VB2_BUF_STATE_ERROR);
+			break;
+		}
+		media_request_object_init(de->req_obj);
+#warning probably needs to _get the req obj too
+#endif
+		ctx->p1idx = (ctx->p1idx + 1 >= RPIVID_P1BUF_COUNT) ?
+							0 : ctx->p1idx + 1;
+
+		/* We know we have src & dst so no need to test */
+		de->src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
+		de->frame_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);
+
+#if USE_REQUEST_PIN
+		de->req_pin = de->src_buf->vb2_buf.req_obj.req;
+		media_request_pin(de->req_pin);
+#else
+		media_request_object_bind(de->src_buf->vb2_buf.req_obj.req,
+					  &dst_req_obj_ops, de, false,
+					  de->req_obj);
+#endif
+
+		/* We could get rid of the src buffer here if we've already
+		 * copied it, but we don't copy the last buffer unless it
+		 * didn't return a contig dma addr and that shouldn't happen
+		 */
+
+		/* Enable the next setup if our Q isn't too big */
+		if (atomic_add_return(1, &ctx->p1out) < RPIVID_P1BUF_COUNT) {
+			xtrace_fin(dev, de);
+			v4l2_m2m_job_finish(dev->m2m_dev, ctx->fh.m2m_ctx);
+		}
+
 		rpivid_hw_irq_active1_claim(dev, &de->irq_ent, phase1_claimed,
 					    de);
+		xtrace_ok(dev, de);
 		break;
 	}
-
-	xtrace_ok(dev, de);
 }
 
 struct rpivid_dec_ops rpivid_dec_ops_h265 = {
--- a/drivers/staging/media/rpivid/rpivid_hw.c
+++ b/drivers/staging/media/rpivid/rpivid_hw.c
@@ -185,14 +185,14 @@ static void do_enable_claim(struct rpivi
 	sched_cb(dev, ictl, ient);
 }
 
-static void ictl_init(struct rpivid_hw_irq_ctrl * const ictl)
+static void ictl_init(struct rpivid_hw_irq_ctrl * const ictl, int enables)
 {
 	spin_lock_init(&ictl->lock);
 	ictl->claim = NULL;
 	ictl->tail = NULL;
 	ictl->irq = NULL;
 	ictl->no_sched = 0;
-	ictl->enable = -1;
+	ictl->enable = enables;
 	ictl->thread_reqed = false;
 }
 
@@ -308,8 +308,8 @@ int rpivid_hw_probe(struct rpivid_dev *d
 	int irq_dec;
 	int ret = 0;
 
-	ictl_init(&dev->ic_active1);
-	ictl_init(&dev->ic_active2);
+	ictl_init(&dev->ic_active1, RPIVID_P2BUF_COUNT);
+	ictl_init(&dev->ic_active2, RPIVID_ICTL_ENABLE_UNLIMITED);
 
 	res = platform_get_resource_byname(dev->pdev, IORESOURCE_MEM, "intc");
 	if (!res)