diff options
Diffstat (limited to 'target/linux/bcm27xx/patches-5.15/950-0393-media-rpivid-Remove-the-need-to-have-num_entry_point.patch')
-rw-r--r-- | target/linux/bcm27xx/patches-5.15/950-0393-media-rpivid-Remove-the-need-to-have-num_entry_point.patch | 980 |
1 files changed, 980 insertions, 0 deletions
diff --git a/target/linux/bcm27xx/patches-5.15/950-0393-media-rpivid-Remove-the-need-to-have-num_entry_point.patch b/target/linux/bcm27xx/patches-5.15/950-0393-media-rpivid-Remove-the-need-to-have-num_entry_point.patch new file mode 100644 index 0000000000..aec95636f9 --- /dev/null +++ b/target/linux/bcm27xx/patches-5.15/950-0393-media-rpivid-Remove-the-need-to-have-num_entry_point.patch @@ -0,0 +1,980 @@ +From 4b33f988d9da9776bcfe218df4ab9865f4b771a8 Mon Sep 17 00:00:00 2001 +From: John Cox <jc@kynesim.co.uk> +Date: Thu, 21 May 2020 11:49:37 +0100 +Subject: [PATCH] media: rpivid: Remove the need to have + num_entry_points set + +VAAPI H265 has num entry points but never sets it. Allow a VAAPI +shim to work without requiring rewriting the VAAPI driver. +num_entry_points can be calculated from the slice_segment_addr +of the next slice so delay processing until we have that. + +Also includes some minor cosmetics. + +Signed-off-by: John Cox <jc@kynesim.co.uk> +--- + drivers/staging/media/rpivid/rpivid_h265.c | 699 +++++++++++---------- + 1 file changed, 365 insertions(+), 334 deletions(-) + +--- a/drivers/staging/media/rpivid/rpivid_h265.c ++++ b/drivers/staging/media/rpivid/rpivid_h265.c +@@ -202,8 +202,17 @@ struct rpivid_dec_env { + unsigned int dpbno_col; + u32 reg_slicestart; + int collocated_from_l0_flag; +- unsigned int wpp_entry_x; +- unsigned int wpp_entry_y; ++ /* ++ * Last CTB/Tile X,Y processed by (wpp_)entry_point ++ * Could be in _state as P0 only but needs updating where _state ++ * is const ++ */ ++ unsigned int entry_ctb_x; ++ unsigned int entry_ctb_y; ++ unsigned int entry_tile_x; ++ unsigned int entry_tile_y; ++ unsigned int entry_qp; ++ u32 entry_slice; + + u32 rpi_config2; + u32 rpi_framesize; +@@ -239,22 +248,17 @@ struct rpivid_dec_state { + struct v4l2_ctrl_hevc_pps pps; + + // Helper vars & tables derived from sps/pps +- unsigned int log2_ctb_size; /* log2 width of a CTB */ +- unsigned int ctb_width; /* Width in CTBs */ +- unsigned int ctb_height; /* Height in CTBs */ +- unsigned int ctb_size; /* Pic area in CTBs */ +- unsigned int num_tile_columns; +- unsigned int num_tile_rows; +- u8 column_width[member_size(struct v4l2_ctrl_hevc_pps, +- column_width_minus1)]; +- u8 row_height[member_size(struct v4l2_ctrl_hevc_pps, +- row_height_minus1)]; ++ unsigned int log2_ctb_size; /* log2 width of a CTB */ ++ unsigned int ctb_width; /* Width in CTBs */ ++ unsigned int ctb_height; /* Height in CTBs */ ++ unsigned int ctb_size; /* Pic area in CTBs */ ++ unsigned int tile_width; /* Width in tiles */ ++ unsigned int tile_height; /* Height in tiles */ + + int *col_bd; + int *row_bd; + int *ctb_addr_rs_to_ts; + int *ctb_addr_ts_to_rs; +- int *tile_id; + + // Aux starage for DPB + // Hold refs +@@ -274,6 +278,12 @@ struct rpivid_dec_state { + unsigned int slice_qp; + unsigned int max_num_merge_cand; // 0 if I-slice + bool dependent_slice_segment_flag; ++ ++ unsigned int start_ts; /* slice_segment_addr -> ts */ ++ unsigned int start_ctb_x; /* CTB X,Y of start_ts */ ++ unsigned int start_ctb_y; ++ unsigned int prev_ctb_x; /* CTB X,Y of start_ts - 1 */ ++ unsigned int prev_ctb_y; + }; + + static inline int clip_int(const int x, const int lo, const int hi) +@@ -319,15 +329,16 @@ static int ctb_to_tile(unsigned int ctb, + return i - 1; + } + +-static int ctb_to_slice_w_h(unsigned int ctb, int ctb_size, int width, +- unsigned int *bd, int num) ++static unsigned int ctb_to_tile_x(const struct rpivid_dec_state *const s, ++ const unsigned int ctb_x) + { +- if (ctb < bd[num - 1]) +- return ctb_size; +- else if (width % ctb_size) +- return width % ctb_size; +- else +- return ctb_size; ++ return ctb_to_tile(ctb_x, s->col_bd, s->tile_width); ++} ++ ++static unsigned int ctb_to_tile_y(const struct rpivid_dec_state *const s, ++ const unsigned int ctb_y) ++{ ++ return ctb_to_tile(ctb_y, s->row_bd, s->tile_height); + } + + static void aux_q_free(struct rpivid_ctx *const ctx, +@@ -532,6 +543,15 @@ static void write_prob(struct rpivid_dec + p1_apb_write(de, 0x1000 + i, + dst[i] + (dst[i + 1] << 8) + (dst[i + 2] << 16) + + (dst[i + 3] << 24)); ++ ++ /* ++ * Having written the prob array back it up ++ * This is not always needed but is a small overhead that simplifies ++ * (and speeds up) some multi-tile & WPP scenarios ++ * There are no scenarios where having written a prob we ever want ++ * a previous (non-initial) state back ++ */ ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); + } + + static void write_scaling_factors(struct rpivid_dec_env *const de) +@@ -552,8 +572,8 @@ static inline __u32 dma_to_axi_addr(dma_ + static void write_bitstream(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s) + { +- // Note that FFmpeg removes emulation prevention bytes, so this is +- // matched in the configuration here. ++ // Note that FFmpeg V4L2 does not remove emulation prevention bytes, ++ // so this is matched in the configuration here. + // Whether that is the correct behaviour or not is not clear in the + // spec. + const int rpi_use_emu = 1; +@@ -579,78 +599,26 @@ static void write_bitstream(struct rpivi + + ////////////////////////////////////////////////////////////////////////////// + +-static void write_slice(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- const unsigned int slice_w, +- const unsigned int slice_h) +-{ +- u32 u32 = (s->sh->slice_type << 12) + +- (((s->sh->flags & +- V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA) != 0) +- << 14) + +- (((s->sh->flags & +- V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA) != 0) +- << 15) + +- (slice_w << 17) + (slice_h << 24); +- +- u32 |= (s->max_num_merge_cand << 0) + (s->nb_refs[L0] << 4) + +- (s->nb_refs[L1] << 8); +- +- if (s->sh->slice_type == HEVC_SLICE_B) +- u32 |= ((s->sh->flags & +- V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO) != 0) +- << 16; +- p1_apb_write(de, RPI_SLICE, u32); +-} +- +-////////////////////////////////////////////////////////////////////////////// +-// Tiles mode +- +-static void new_entry_point(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- const int do_bte, +- const int reset_qp_y, const int ctb_addr_ts) ++/* ++ * The slice constant part of the slice register - width and height need to ++ * be ORed in later as they are per-tile / WPP-row ++ */ ++static u32 slice_reg_const(const struct rpivid_dec_state *const s) + { +- int ctb_col = s->ctb_addr_ts_to_rs[ctb_addr_ts] % +- de->pic_width_in_ctbs_y; +- int ctb_row = s->ctb_addr_ts_to_rs[ctb_addr_ts] / +- de->pic_width_in_ctbs_y; +- +- int tile_x = ctb_to_tile(ctb_col, s->col_bd, s->num_tile_columns); +- int tile_y = ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows); +- +- int endx = s->col_bd[tile_x + 1] - 1; +- int endy = s->row_bd[tile_y + 1] - 1; +- +- u8 slice_w = ctb_to_slice_w_h(ctb_col, 1 << s->log2_ctb_size, +- s->sps.pic_width_in_luma_samples, +- s->col_bd, s->num_tile_columns); +- u8 slice_h = ctb_to_slice_w_h(ctb_row, 1 << s->log2_ctb_size, +- s->sps.pic_height_in_luma_samples, +- s->row_bd, s->num_tile_rows); +- +- p1_apb_write(de, RPI_TILESTART, +- s->col_bd[tile_x] + (s->row_bd[tile_y] << 16)); +- p1_apb_write(de, RPI_TILEEND, endx + (endy << 16)); +- +- if (do_bte) +- p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16)); ++ u32 x = (s->max_num_merge_cand << 0) | ++ (s->nb_refs[L0] << 4) | ++ (s->nb_refs[L1] << 8) | ++ (s->sh->slice_type << 12); ++ ++ if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA) ++ x |= BIT(14); ++ if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA) ++ x |= BIT(15); ++ if (s->sh->slice_type == HEVC_SLICE_B && ++ (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO)) ++ x |= BIT(16); + +- write_slice(de, s, slice_w, slice_h); +- +- if (reset_qp_y) { +- unsigned int sps_qp_bd_offset = +- 6 * s->sps.bit_depth_luma_minus8; +- +- p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp); +- } +- +- p1_apb_write(de, RPI_MODE, +- (0xFFFF << 0) + (0x0 << 16) + +- ((tile_x == s->num_tile_columns - 1) << 17) + +- ((tile_y == s->num_tile_rows - 1) << 18)); +- +- p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16)); ++ return x; + } + + ////////////////////////////////////////////////////////////////////////////// +@@ -934,197 +902,256 @@ static void pre_slice_decode(struct rpiv + (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF + } + +-////////////////////////////////////////////////////////////////////////////// +-// Write STATUS register with expected end CTU address of previous slice +- +-static void end_previous_slice(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- const int ctb_addr_ts) +-{ +- int last_x = +- s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y; +- int last_y = +- s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y; +- +- p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); +-} +- +-static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row) +-{ +- p1_apb_write(de, RPI_STATUS, (ctb_row << 18) + 0x25); +- p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); +- p1_apb_write(de, RPI_MODE, +- ctb_row == de->pic_height_in_ctbs_y - 1 ? +- 0x70000 : 0x30000); +- p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2); +-} +- +-static void wpp_end_previous_slice(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- int ctb_addr_ts) +-{ +- int new_x = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y; +- int new_y = s->sh->slice_segment_addr / de->pic_width_in_ctbs_y; +- int last_x = +- s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y; +- int last_y = +- s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y; +- +- if (de->wpp_entry_x < 2 && (de->wpp_entry_y < new_y || new_x > 2) && +- de->pic_width_in_ctbs_y > 2) +- wpp_pause(de, last_y); +- p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); +- if (new_x == 2 || (de->pic_width_in_ctbs_y == 2 && +- de->wpp_entry_y < new_y)) +- p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++static void write_slice(struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s, ++ const u32 slice_const, ++ const unsigned int ctb_col, ++ const unsigned int ctb_row) ++{ ++ const unsigned int cs = (1 << s->log2_ctb_size); ++ const unsigned int w_last = s->sps.pic_width_in_luma_samples & (cs - 1); ++ const unsigned int h_last = s->sps.pic_height_in_luma_samples & (cs - 1); ++ ++ p1_apb_write(de, RPI_SLICE, ++ slice_const | ++ ((ctb_col + 1 < s->ctb_width || !w_last ? ++ cs : w_last) << 17) | ++ ((ctb_row + 1 < s->ctb_height || !h_last ? ++ cs : h_last) << 24)); + } + +-////////////////////////////////////////////////////////////////////////////// +-// Wavefront mode ++#define PAUSE_MODE_WPP 1 ++#define PAUSE_MODE_TILE 0xffff + +-static void wpp_entry_point(struct rpivid_dec_env *const de, ++/* ++ * N.B. This can be called to fill in data from the previous slice so must not ++ * use any state data that may change from slice to slice (e.g. qp) ++ */ ++static void new_entry_point(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s, +- const int do_bte, +- const int reset_qp_y, const int ctb_addr_ts) +-{ +- int ctb_size = 1 << s->log2_ctb_size; +- int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; +- +- int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->pic_width_in_ctbs_y; +- int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->pic_width_in_ctbs_y; ++ const bool do_bte, ++ const bool reset_qp_y, ++ const u32 pause_mode, ++ const unsigned int tile_x, ++ const unsigned int tile_y, ++ const unsigned int ctb_col, ++ const unsigned int ctb_row, ++ const unsigned int slice_qp, ++ const u32 slice_const) ++{ ++ const unsigned int endx = s->col_bd[tile_x + 1] - 1; ++ const unsigned int endy = (pause_mode == PAUSE_MODE_WPP) ? ++ ctb_row : s->row_bd[tile_y + 1] - 1; + +- int endx = de->pic_width_in_ctbs_y - 1; +- int endy = ctb_row; +- +- u8 slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, +- s->sps.pic_width_in_luma_samples, +- s->col_bd, s->num_tile_columns); +- u8 slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, +- s->sps.pic_height_in_luma_samples, +- s->row_bd, s->num_tile_rows); +- +- p1_apb_write(de, RPI_TILESTART, 0); +- p1_apb_write(de, RPI_TILEEND, endx + (endy << 16)); ++ p1_apb_write(de, RPI_TILESTART, ++ s->col_bd[tile_x] | (s->row_bd[tile_y] << 16)); ++ p1_apb_write(de, RPI_TILEEND, endx | (endy << 16)); + + if (do_bte) +- p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16)); ++ p1_apb_write(de, RPI_BEGINTILEEND, endx | (endy << 16)); + +- write_slice(de, s, slice_w, +- ctb_row == de->pic_height_in_ctbs_y - 1 ? +- slice_h : ctb_size); ++ write_slice(de, s, slice_const, endx, endy); + + if (reset_qp_y) { + unsigned int sps_qp_bd_offset = + 6 * s->sps.bit_depth_luma_minus8; + +- p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp); ++ p1_apb_write(de, RPI_QP, sps_qp_bd_offset + slice_qp); + } + + p1_apb_write(de, RPI_MODE, +- ctb_row == de->pic_height_in_ctbs_y - 1 ? +- 0x60001 : 0x20001); +- p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16)); ++ pause_mode | ++ ((endx == s->ctb_width - 1) << 17) | ++ ((endy == s->ctb_height - 1) << 18)); ++ ++ p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) | (ctb_row << 16)); ++ ++ de->entry_tile_x = tile_x; ++ de->entry_tile_y = tile_y; ++ de->entry_ctb_x = ctb_col; ++ de->entry_ctb_y = ctb_row; ++ de->entry_qp = slice_qp; ++ de->entry_slice = slice_const; + } + + ////////////////////////////////////////////////////////////////////////////// + // Wavefront mode + ++static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row) ++{ ++ p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25); ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ p1_apb_write(de, RPI_MODE, ++ ctb_row == de->pic_height_in_ctbs_y - 1 ? ++ 0x70000 : 0x30000); ++ p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2); ++} ++ ++static void wpp_entry_fill(struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s, ++ const unsigned int last_y) ++{ ++ const unsigned int last_x = s->ctb_width - 1; ++ ++ while (de->entry_ctb_y < last_y) { ++ /* wpp_entry_x/y set by wpp_entry_point */ ++ if (s->ctb_width > 2) ++ wpp_pause(de, de->entry_ctb_y); ++ p1_apb_write(de, RPI_STATUS, ++ (de->entry_ctb_y << 18) | (last_x << 5) | 2); ++ ++ /* if width == 1 then the saved state is the init one */ ++ if (s->ctb_width == 2) ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ else ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ ++ new_entry_point(de, s, false, true, PAUSE_MODE_WPP, ++ 0, 0, 0, de->entry_ctb_y + 1, ++ de->entry_qp, de->entry_slice); ++ } ++} ++ ++static void wpp_end_previous_slice(struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s) ++{ ++ wpp_entry_fill(de, s, s->prev_ctb_y); ++ ++ if (de->entry_ctb_x < 2 && ++ (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) && ++ s->ctb_width > 2) ++ wpp_pause(de, s->prev_ctb_y); ++ p1_apb_write(de, RPI_STATUS, ++ 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18)); ++ if (s->start_ctb_x == 2 || ++ (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y)) ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++} ++ ++/* Only main profile supported so WPP => !Tiles which makes some of the ++ * next chunk code simpler ++ */ + static void wpp_decode_slice(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- const struct v4l2_ctrl_hevc_slice_params *sh, +- int ctb_addr_ts) +-{ +- int i, reset_qp_y = 1; +- int indep = !s->dependent_slice_segment_flag; +- int ctb_col = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y; ++ const struct rpivid_dec_state *const s) ++{ ++ bool reset_qp_y = true; ++ const bool indep = !s->dependent_slice_segment_flag; + +- if (ctb_addr_ts) +- wpp_end_previous_slice(de, s, ctb_addr_ts); ++ if (s->start_ts) ++ wpp_end_previous_slice(de, s); + pre_slice_decode(de, s); + write_bitstream(de, s); +- if (ctb_addr_ts == 0 || indep || de->pic_width_in_ctbs_y == 1) ++ ++ if (!s->start_ts || indep || s->ctb_width == 1) + write_prob(de, s); +- else if (ctb_col == 0) ++ else if (!s->start_ctb_x) + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); + else +- reset_qp_y = 0; ++ reset_qp_y = false; ++ + program_slicecmds(de, s->slice_idx); + new_slice_segment(de, s); +- wpp_entry_point(de, s, indep, reset_qp_y, ctb_addr_ts); ++ new_entry_point(de, s, indep, reset_qp_y, PAUSE_MODE_WPP, ++ 0, 0, s->start_ctb_x, s->start_ctb_y, ++ s->slice_qp, slice_reg_const(s)); + +- for (i = 0; i < s->sh->num_entry_point_offsets; i++) { +- int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y; +- int last_x = de->pic_width_in_ctbs_y - 1; ++ if (s->frame_end) { ++ wpp_entry_fill(de, s, s->ctb_height - 1); ++ ++ if (de->entry_ctb_x < 2 && s->ctb_width > 2) ++ wpp_pause(de, s->ctb_height - 1); + +- if (de->pic_width_in_ctbs_y > 2) +- wpp_pause(de, ctb_row); + p1_apb_write(de, RPI_STATUS, +- (ctb_row << 18) + (last_x << 5) + 2); +- if (de->pic_width_in_ctbs_y == 2) +- p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); +- if (de->pic_width_in_ctbs_y == 1) +- write_prob(de, s); +- else +- p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); +- ctb_addr_ts += s->column_width[0]; +- wpp_entry_point(de, s, 0, 1, ctb_addr_ts); ++ 1 | ((s->ctb_width - 1) << 5) | ++ ((s->ctb_height - 1) << 18)); + } ++ + } + + ////////////////////////////////////////////////////////////////////////////// + // Tiles mode + ++static void tile_entry_fill(struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s, ++ const unsigned int last_tile_x, ++ const unsigned int last_tile_y) ++{ ++ while (de->entry_tile_y < last_tile_y || ++ (de->entry_tile_y == last_tile_y && ++ de->entry_tile_x < last_tile_x)) { ++ unsigned int t_x = de->entry_tile_x; ++ unsigned int t_y = de->entry_tile_y; ++ const unsigned int last_x = s->col_bd[t_x + 1] - 1; ++ const unsigned int last_y = s->row_bd[t_y + 1] - 1; ++ ++ p1_apb_write(de, RPI_STATUS, ++ 2 | (last_x << 5) | (last_y << 18)); ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ ++ // Inc tile ++ if (++t_x >= s->tile_width) { ++ t_x = 0; ++ ++t_y; ++ } ++ ++ new_entry_point(de, s, false, true, PAUSE_MODE_TILE, ++ t_x, t_y, s->col_bd[t_x], s->row_bd[t_y], ++ de->entry_qp, de->entry_slice); ++ } ++} ++ ++/* ++ * Write STATUS register with expected end CTU address of previous slice ++ */ ++static void end_previous_slice(struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s) ++{ ++ tile_entry_fill(de, s, ++ ctb_to_tile_x(s, s->prev_ctb_x), ++ ctb_to_tile_y(s, s->prev_ctb_y)); ++ p1_apb_write(de, RPI_STATUS, ++ 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18)); ++} ++ + static void decode_slice(struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s, +- const struct v4l2_ctrl_hevc_slice_params *const sh, +- int ctb_addr_ts) ++ const struct rpivid_dec_state *const s) + { +- int i, reset_qp_y; ++ bool reset_qp_y; ++ unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x); ++ unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y); + +- if (ctb_addr_ts) +- end_previous_slice(de, s, ctb_addr_ts); ++ if (s->start_ts) ++ end_previous_slice(de, s); + + pre_slice_decode(de, s); + write_bitstream(de, s); + +-#if DEBUG_TRACE_P1_CMD +- if (p1_z < 256) { +- v4l2_info(&de->ctx->dev->v4l2_dev, +- "TS=%d, tile=%d/%d, dss=%d, flags=%#llx\n", +- ctb_addr_ts, s->tile_id[ctb_addr_ts], +- s->tile_id[ctb_addr_ts - 1], +- s->dependent_slice_segment_flag, sh->flags); +- } +-#endif +- +- reset_qp_y = ctb_addr_ts == 0 || +- s->tile_id[ctb_addr_ts] != s->tile_id[ctb_addr_ts - 1] || +- !s->dependent_slice_segment_flag; ++ reset_qp_y = !s->start_ts || ++ !s->dependent_slice_segment_flag || ++ tile_x != ctb_to_tile_x(s, s->prev_ctb_x) || ++ tile_y != ctb_to_tile_y(s, s->prev_ctb_y); + if (reset_qp_y) + write_prob(de, s); + + program_slicecmds(de, s->slice_idx); + new_slice_segment(de, s); + new_entry_point(de, s, !s->dependent_slice_segment_flag, reset_qp_y, +- ctb_addr_ts); +- +- for (i = 0; i < s->sh->num_entry_point_offsets; i++) { +- int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_col = ctb_addr_rs % de->pic_width_in_ctbs_y; +- int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y; +- int tile_x = ctb_to_tile(ctb_col, s->col_bd, +- s->num_tile_columns - 1); +- int tile_y = +- ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows - 1); +- int last_x = s->col_bd[tile_x + 1] - 1; +- int last_y = s->row_bd[tile_y + 1] - 1; ++ PAUSE_MODE_TILE, ++ tile_x, tile_y, s->start_ctb_x, s->start_ctb_y, ++ s->slice_qp, slice_reg_const(s)); + ++ /* ++ * If this is the last slice then fill in the other tile entries ++ * now, otherwise this will be done at the start of the next slice ++ * when it will be known where this slice finishes ++ */ ++ if (s->frame_end) { ++ tile_entry_fill(de, s, ++ s->tile_width - 1, ++ s->tile_height - 1); + p1_apb_write(de, RPI_STATUS, +- 2 + (last_x << 5) + (last_y << 18)); +- write_prob(de, s); +- ctb_addr_ts += s->column_width[tile_x] * s->row_height[tile_y]; +- new_entry_point(de, s, 0, 1, ctb_addr_ts); ++ 1 | ((s->ctb_width - 1) << 5) | ++ ((s->ctb_height - 1) << 18)); + } + } + +@@ -1132,13 +1159,12 @@ static void decode_slice(struct rpivid_d + // Scaling factors + + static void expand_scaling_list(const unsigned int size_id, +- const unsigned int matrix_id, u8 *const dst0, ++ u8 *const dst0, + const u8 *const src0, uint8_t dc) + { + u8 *d; + unsigned int x, y; + +- // FIXME: matrix_id is unused ? + switch (size_id) { + case 0: + memcpy(dst0, src0, 16); +@@ -1199,24 +1225,20 @@ static void populate_scaling_factors(con + unsigned int mid; + + for (mid = 0; mid < 6; mid++) +- expand_scaling_list(0, mid, +- de->scaling_factors + ++ expand_scaling_list(0, de->scaling_factors + + scaling_factor_offsets[0][mid], + sl->scaling_list_4x4[mid], 0); + for (mid = 0; mid < 6; mid++) +- expand_scaling_list(1, mid, +- de->scaling_factors + ++ expand_scaling_list(1, de->scaling_factors + + scaling_factor_offsets[1][mid], + sl->scaling_list_8x8[mid], 0); + for (mid = 0; mid < 6; mid++) +- expand_scaling_list(2, mid, +- de->scaling_factors + ++ expand_scaling_list(2, de->scaling_factors + + scaling_factor_offsets[2][mid], + sl->scaling_list_16x16[mid], + sl->scaling_list_dc_coef_16x16[mid]); +- for (mid = 0; mid < 2; mid += 1) +- expand_scaling_list(3, mid, +- de->scaling_factors + ++ for (mid = 0; mid < 2; mid++) ++ expand_scaling_list(3, de->scaling_factors + + scaling_factor_offsets[3][mid], + sl->scaling_list_32x32[mid], + sl->scaling_list_dc_coef_32x32[mid]); +@@ -1228,8 +1250,6 @@ static void free_ps_info(struct rpivid_d + s->ctb_addr_rs_to_ts = NULL; + kfree(s->ctb_addr_ts_to_rs); + s->ctb_addr_ts_to_rs = NULL; +- kfree(s->tile_id); +- s->tile_id = NULL; + + kfree(s->col_bd); + s->col_bd = NULL; +@@ -1237,10 +1257,52 @@ static void free_ps_info(struct rpivid_d + s->row_bd = NULL; + } + ++static unsigned int tile_width(const struct rpivid_dec_state *const s, ++ const unsigned int t_x) ++{ ++ return s->col_bd[t_x + 1] - s->col_bd[t_x]; ++} ++ ++static unsigned int tile_height(const struct rpivid_dec_state *const s, ++ const unsigned int t_y) ++{ ++ return s->row_bd[t_y + 1] - s->row_bd[t_y]; ++} ++ ++static void fill_rs_to_ts(struct rpivid_dec_state *const s) ++{ ++ unsigned int ts = 0; ++ unsigned int t_y; ++ unsigned int tr_rs = 0; ++ ++ for (t_y = 0; t_y != s->tile_height; ++t_y) { ++ const unsigned int t_h = tile_height(s, t_y); ++ unsigned int t_x; ++ unsigned int tc_rs = tr_rs; ++ ++ for (t_x = 0; t_x != s->tile_width; ++t_x) { ++ const unsigned int t_w = tile_width(s, t_x); ++ unsigned int y; ++ unsigned int rs = tc_rs; ++ ++ for (y = 0; y != t_h; ++y) { ++ unsigned int x; ++ ++ for (x = 0; x != t_w; ++x) { ++ s->ctb_addr_rs_to_ts[rs + x] = ts; ++ s->ctb_addr_ts_to_rs[ts] = rs + x; ++ ++ts; ++ } ++ rs += s->ctb_width; ++ } ++ tc_rs += t_w; ++ } ++ tr_rs += t_h * s->ctb_width; ++ } ++} ++ + static int updated_ps(struct rpivid_dec_state *const s) + { +- unsigned int ctb_addr_rs; +- int j, x, y, tile_id; + unsigned int i; + + free_ps_info(s); +@@ -1259,104 +1321,49 @@ static int updated_ps(struct rpivid_dec_ + + // Inferred parameters + ++ s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size, ++ sizeof(*s->ctb_addr_rs_to_ts), ++ GFP_KERNEL); ++ s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size, ++ sizeof(*s->ctb_addr_ts_to_rs), ++ GFP_KERNEL); ++ + if (!(s->pps.flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) { +- s->num_tile_columns = 1; +- s->num_tile_rows = 1; +- s->column_width[0] = s->ctb_width; +- s->row_height[0] = s->ctb_height; ++ s->tile_width = 1; ++ s->tile_height = 1; + } else { +- s->num_tile_columns = s->pps.num_tile_columns_minus1 + 1; +- s->num_tile_rows = s->pps.num_tile_rows_minus1 + 1; +- for (i = 0; i < s->num_tile_columns; ++i) +- s->column_width[i] = s->pps.column_width_minus1[i] + 1; +- for (i = 0; i < s->num_tile_rows; ++i) +- s->row_height[i] = s->pps.row_height_minus1[i] + 1; ++ s->tile_width = s->pps.num_tile_columns_minus1 + 1; ++ s->tile_height = s->pps.num_tile_rows_minus1 + 1; + } + +- s->col_bd = kmalloc((s->num_tile_columns + 1) * sizeof(*s->col_bd), ++ s->col_bd = kmalloc((s->tile_width + 1) * sizeof(*s->col_bd), + GFP_KERNEL); +- s->row_bd = kmalloc((s->num_tile_rows + 1) * sizeof(*s->row_bd), ++ s->row_bd = kmalloc((s->tile_height + 1) * sizeof(*s->row_bd), + GFP_KERNEL); + + s->col_bd[0] = 0; +- for (i = 0; i < s->num_tile_columns; i++) +- s->col_bd[i + 1] = s->col_bd[i] + s->column_width[i]; ++ for (i = 1; i < s->tile_width; i++) ++ s->col_bd[i] = s->col_bd[i - 1] + ++ s->pps.column_width_minus1[i - 1] + 1; ++ s->col_bd[s->tile_width] = s->ctb_width; + + s->row_bd[0] = 0; +- for (i = 0; i < s->num_tile_rows; i++) +- s->row_bd[i + 1] = s->row_bd[i] + s->row_height[i]; ++ for (i = 1; i < s->tile_height; i++) ++ s->row_bd[i] = s->row_bd[i - 1] + ++ s->pps.row_height_minus1[i - 1] + 1; ++ s->row_bd[s->tile_height] = s->ctb_height; + +- s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size, +- sizeof(*s->ctb_addr_rs_to_ts), +- GFP_KERNEL); +- s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size, +- sizeof(*s->ctb_addr_ts_to_rs), +- GFP_KERNEL); +- s->tile_id = kmalloc_array(s->ctb_size, sizeof(*s->tile_id), +- GFP_KERNEL); +- +- for (ctb_addr_rs = 0; ctb_addr_rs < s->ctb_size; ctb_addr_rs++) { +- int tb_x = ctb_addr_rs % s->ctb_width; +- int tb_y = ctb_addr_rs / s->ctb_width; +- int tile_x = 0; +- int tile_y = 0; +- int val = 0; +- +- for (i = 0; i < s->num_tile_columns; i++) { +- if (tb_x < s->col_bd[i + 1]) { +- tile_x = i; +- break; +- } +- } +- +- for (i = 0; i < s->num_tile_rows; i++) { +- if (tb_y < s->row_bd[i + 1]) { +- tile_y = i; +- break; +- } +- } +- +- for (i = 0; i < tile_x; i++) +- val += s->row_height[tile_y] * s->column_width[i]; +- for (i = 0; i < tile_y; i++) +- val += s->ctb_width * s->row_height[i]; +- +- val += (tb_y - s->row_bd[tile_y]) * s->column_width[tile_x] + +- tb_x - s->col_bd[tile_x]; +- +- s->ctb_addr_rs_to_ts[ctb_addr_rs] = val; +- s->ctb_addr_ts_to_rs[val] = ctb_addr_rs; +- } +- +- for (j = 0, tile_id = 0; j < s->num_tile_rows; j++) +- for (i = 0; i < s->num_tile_columns; i++, tile_id++) +- for (y = s->row_bd[j]; y < s->row_bd[j + 1]; y++) +- for (x = s->col_bd[i]; +- x < s->col_bd[i + 1]; +- x++) +- s->tile_id[s->ctb_addr_rs_to_ts +- [y * s->ctb_width + +- x]] = tile_id; ++ fill_rs_to_ts(s); + + return 0; + } + +-static int frame_end(struct rpivid_dev *const dev, +- struct rpivid_dec_env *const de, +- const struct rpivid_dec_state *const s) +-{ +- const unsigned int last_x = s->col_bd[s->num_tile_columns] - 1; +- const unsigned int last_y = s->row_bd[s->num_tile_rows] - 1; +- size_t cmd_size; +- +- if (s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED) { +- if (de->wpp_entry_x < 2 && de->pic_width_in_ctbs_y > 2) +- wpp_pause(de, last_y); +- } +- p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); +- ++static int write_cmd_buffer(struct rpivid_dev *const dev, ++ struct rpivid_dec_env *const de, ++ const struct rpivid_dec_state *const s) ++{ + // Copy commands out to dma buf +- cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]); ++ const size_t cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]); + + if (!de->cmd_copy_gptr->ptr || cmd_size > de->cmd_copy_gptr->size) { + size_t cmd_alloc = round_up_size(cmd_size); +@@ -1521,18 +1528,19 @@ static void rpivid_h265_setup(struct rpi + struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + struct rpivid_dec_state *const s = ctx->state; + struct vb2_queue *vq; +- struct rpivid_dec_env *de; +- int ctb_addr_ts; ++ struct rpivid_dec_env *de = ctx->dec0; ++ unsigned int prev_rs; + unsigned int i; + int use_aux; + bool slice_temporal_mvp; + ++ xtrace_in(dev, de); ++ + pred_weight_table = &sh->pred_weight_table; + + s->frame_end = + ((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0); + +- de = ctx->dec0; + slice_temporal_mvp = (sh->flags & + V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED); + +@@ -1662,6 +1670,13 @@ static void rpivid_h265_setup(struct rpi + s->sps.pic_height_in_luma_samples); + goto fail; + } ++ if ((s->tile_width != 1 || s->tile_height != 1) && ++ (s->pps.flags & ++ V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED)) { ++ v4l2_warn(&dev->v4l2_dev, ++ "Tiles + WPP not supported\n"); ++ goto fail; ++ } + + // Fill in ref planes with our address s.t. if we mess + // up refs somehow then we still have a valid address +@@ -1760,15 +1775,24 @@ static void rpivid_h265_setup(struct rpi + if (s->sps.flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED) + populate_scaling_factors(run, de, s); + +- ctb_addr_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr]; ++ // Calc all the random coord info to avoid repeated conversion in/out ++ s->start_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr]; ++ s->start_ctb_x = sh->slice_segment_addr % de->pic_width_in_ctbs_y; ++ s->start_ctb_y = sh->slice_segment_addr / de->pic_width_in_ctbs_y; ++ // Last CTB of previous slice ++ prev_rs = !s->start_ts ? 0 : s->ctb_addr_ts_to_rs[s->start_ts - 1]; ++ s->prev_ctb_x = prev_rs % de->pic_width_in_ctbs_y; ++ s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y; + + if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED)) +- wpp_decode_slice(de, s, sh, ctb_addr_ts); ++ wpp_decode_slice(de, s); + else +- decode_slice(de, s, sh, ctb_addr_ts); ++ decode_slice(de, s); + +- if (!s->frame_end) ++ if (!s->frame_end) { ++ xtrace_ok(dev, de); + return; ++ } + + // Frame end + memset(dpb_q_aux, 0, +@@ -1776,8 +1800,9 @@ static void rpivid_h265_setup(struct rpi + /* + * Need Aux ents for all (ref) DPB ents if temporal MV could + * be enabled for any pic +- * ** At the moment we have aux ents for all pics whether or not +- * they are ref ++ * ** At the moment we create aux ents for all pics whether or not ++ * they are ref - they should then be discarded by the DPB-aux ++ * garbage collection code + */ + use_aux = ((s->sps.flags & + V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) != 0); +@@ -1795,7 +1820,7 @@ static void rpivid_h265_setup(struct rpi + } + + // v4l2_info(&dev->v4l2_dev, "rpivid_h265_end of frame\n"); +- if (frame_end(dev, de, s)) ++ if (write_cmd_buffer(dev, de, s)) + goto fail; + + for (i = 0; i < sh->num_active_dpb_entries; ++i) { +@@ -1876,6 +1901,7 @@ static void rpivid_h265_setup(struct rpi + } + + de->state = RPIVID_DECODE_PHASE1; ++ xtrace_ok(dev, de); + return; + + fail: +@@ -1883,6 +1909,7 @@ fail: + // Actual error reporting happens in Trigger + de->state = s->frame_end ? RPIVID_DECODE_ERROR_DONE : + RPIVID_DECODE_ERROR_CONTINUE; ++ xtrace_fail(dev, de); + } + + ////////////////////////////////////////////////////////////////////////////// +@@ -2210,6 +2237,10 @@ static int rpivid_h265_start(struct rpiv + size_t pu_alloc; + size_t coeff_alloc; + ++#if DEBUG_TRACE_P1_CMD ++ p1_z = 0; ++#endif ++ + // Generate a sanitised WxH for memory alloc + // Assume HD if unset + if (w == 0) |