From 0962d9e4a57c3faa29000c63c1cb28ad39a6c80c Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 21 May 2020 11:49:37 +0100 Subject: [PATCH] media: rpivid: Remove the need to have num_entry_points set VAAPI H265 has num entry points but never sets it. Allow a VAAPI shim to work without requiring rewriting the VAAPI driver. num_entry_points can be calculated from the slice_segment_addr of the next slice so delay processing until we have that. Also includes some minor cosmetics. Signed-off-by: John Cox --- drivers/staging/media/rpivid/rpivid_h265.c | 699 +++++++++++---------- 1 file changed, 365 insertions(+), 334 deletions(-) --- a/drivers/staging/media/rpivid/rpivid_h265.c +++ b/drivers/staging/media/rpivid/rpivid_h265.c @@ -202,8 +202,17 @@ struct rpivid_dec_env { unsigned int dpbno_col; u32 reg_slicestart; int collocated_from_l0_flag; - unsigned int wpp_entry_x; - unsigned int wpp_entry_y; + /* + * Last CTB/Tile X,Y processed by (wpp_)entry_point + * Could be in _state as P0 only but needs updating where _state + * is const + */ + unsigned int entry_ctb_x; + unsigned int entry_ctb_y; + unsigned int entry_tile_x; + unsigned int entry_tile_y; + unsigned int entry_qp; + u32 entry_slice; u32 rpi_config2; u32 rpi_framesize; @@ -239,22 +248,17 @@ struct rpivid_dec_state { struct v4l2_ctrl_hevc_pps pps; // Helper vars & tables derived from sps/pps - unsigned int log2_ctb_size; /* log2 width of a CTB */ - unsigned int ctb_width; /* Width in CTBs */ - unsigned int ctb_height; /* Height in CTBs */ - unsigned int ctb_size; /* Pic area in CTBs */ - unsigned int num_tile_columns; - unsigned int num_tile_rows; - u8 column_width[member_size(struct v4l2_ctrl_hevc_pps, - column_width_minus1)]; - u8 row_height[member_size(struct v4l2_ctrl_hevc_pps, - row_height_minus1)]; + unsigned int log2_ctb_size; /* log2 width of a CTB */ + unsigned int ctb_width; /* Width in CTBs */ + unsigned int ctb_height; /* Height in CTBs */ + unsigned int ctb_size; /* Pic area in CTBs */ + unsigned int tile_width; /* Width in tiles */ + unsigned int tile_height; /* Height in tiles */ int *col_bd; int *row_bd; int *ctb_addr_rs_to_ts; int *ctb_addr_ts_to_rs; - int *tile_id; // Aux starage for DPB // Hold refs @@ -274,6 +278,12 @@ struct rpivid_dec_state { unsigned int slice_qp; unsigned int max_num_merge_cand; // 0 if I-slice bool dependent_slice_segment_flag; + + unsigned int start_ts; /* slice_segment_addr -> ts */ + unsigned int start_ctb_x; /* CTB X,Y of start_ts */ + unsigned int start_ctb_y; + unsigned int prev_ctb_x; /* CTB X,Y of start_ts - 1 */ + unsigned int prev_ctb_y; }; static inline int clip_int(const int x, const int lo, const int hi) @@ -319,15 +329,16 @@ static int ctb_to_tile(unsigned int ctb, return i - 1; } -static int ctb_to_slice_w_h(unsigned int ctb, int ctb_size, int width, - unsigned int *bd, int num) +static unsigned int ctb_to_tile_x(const struct rpivid_dec_state *const s, + const unsigned int ctb_x) { - if (ctb < bd[num - 1]) - return ctb_size; - else if (width % ctb_size) - return width % ctb_size; - else - return ctb_size; + return ctb_to_tile(ctb_x, s->col_bd, s->tile_width); +} + +static unsigned int ctb_to_tile_y(const struct rpivid_dec_state *const s, + const unsigned int ctb_y) +{ + return ctb_to_tile(ctb_y, s->row_bd, s->tile_height); } static void aux_q_free(struct rpivid_ctx *const ctx, @@ -532,6 +543,15 @@ static void write_prob(struct rpivid_dec p1_apb_write(de, 0x1000 + i, dst[i] + (dst[i + 1] << 8) + (dst[i + 2] << 16) + (dst[i + 3] << 24)); + + /* + * Having written the prob array back it up + * This is not always needed but is a small overhead that simplifies + * (and speeds up) some multi-tile & WPP scenarios + * There are no scenarios where having written a prob we ever want + * a previous (non-initial) state back + */ + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); } static void write_scaling_factors(struct rpivid_dec_env *const de) @@ -552,8 +572,8 @@ static inline __u32 dma_to_axi_addr(dma_ static void write_bitstream(struct rpivid_dec_env *const de, const struct rpivid_dec_state *const s) { - // Note that FFmpeg removes emulation prevention bytes, so this is - // matched in the configuration here. + // Note that FFmpeg V4L2 does not remove emulation prevention bytes, + // so this is matched in the configuration here. // Whether that is the correct behaviour or not is not clear in the // spec. const int rpi_use_emu = 1; @@ -579,78 +599,26 @@ static void write_bitstream(struct rpivi ////////////////////////////////////////////////////////////////////////////// -static void write_slice(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - const unsigned int slice_w, - const unsigned int slice_h) -{ - u32 u32 = (s->sh->slice_type << 12) + - (((s->sh->flags & - V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA) != 0) - << 14) + - (((s->sh->flags & - V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA) != 0) - << 15) + - (slice_w << 17) + (slice_h << 24); - - u32 |= (s->max_num_merge_cand << 0) + (s->nb_refs[L0] << 4) + - (s->nb_refs[L1] << 8); - - if (s->sh->slice_type == HEVC_SLICE_B) - u32 |= ((s->sh->flags & - V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO) != 0) - << 16; - p1_apb_write(de, RPI_SLICE, u32); -} - -////////////////////////////////////////////////////////////////////////////// -// Tiles mode - -static void new_entry_point(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - const int do_bte, - const int reset_qp_y, const int ctb_addr_ts) +/* + * The slice constant part of the slice register - width and height need to + * be ORed in later as they are per-tile / WPP-row + */ +static u32 slice_reg_const(const struct rpivid_dec_state *const s) { - int ctb_col = s->ctb_addr_ts_to_rs[ctb_addr_ts] % - de->pic_width_in_ctbs_y; - int ctb_row = s->ctb_addr_ts_to_rs[ctb_addr_ts] / - de->pic_width_in_ctbs_y; - - int tile_x = ctb_to_tile(ctb_col, s->col_bd, s->num_tile_columns); - int tile_y = ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows); - - int endx = s->col_bd[tile_x + 1] - 1; - int endy = s->row_bd[tile_y + 1] - 1; - - u8 slice_w = ctb_to_slice_w_h(ctb_col, 1 << s->log2_ctb_size, - s->sps.pic_width_in_luma_samples, - s->col_bd, s->num_tile_columns); - u8 slice_h = ctb_to_slice_w_h(ctb_row, 1 << s->log2_ctb_size, - s->sps.pic_height_in_luma_samples, - s->row_bd, s->num_tile_rows); - - p1_apb_write(de, RPI_TILESTART, - s->col_bd[tile_x] + (s->row_bd[tile_y] << 16)); - p1_apb_write(de, RPI_TILEEND, endx + (endy << 16)); - - if (do_bte) - p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16)); + u32 x = (s->max_num_merge_cand << 0) | + (s->nb_refs[L0] << 4) | + (s->nb_refs[L1] << 8) | + (s->sh->slice_type << 12); + + if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA) + x |= BIT(14); + if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA) + x |= BIT(15); + if (s->sh->slice_type == HEVC_SLICE_B && + (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO)) + x |= BIT(16); - write_slice(de, s, slice_w, slice_h); - - if (reset_qp_y) { - unsigned int sps_qp_bd_offset = - 6 * s->sps.bit_depth_luma_minus8; - - p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp); - } - - p1_apb_write(de, RPI_MODE, - (0xFFFF << 0) + (0x0 << 16) + - ((tile_x == s->num_tile_columns - 1) << 17) + - ((tile_y == s->num_tile_rows - 1) << 18)); - - p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16)); + return x; } ////////////////////////////////////////////////////////////////////////////// @@ -934,197 +902,256 @@ static void pre_slice_decode(struct rpiv (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF } -////////////////////////////////////////////////////////////////////////////// -// Write STATUS register with expected end CTU address of previous slice - -static void end_previous_slice(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - const int ctb_addr_ts) -{ - int last_x = - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y; - int last_y = - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y; - - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); -} - -static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row) -{ - p1_apb_write(de, RPI_STATUS, (ctb_row << 18) + 0x25); - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); - p1_apb_write(de, RPI_MODE, - ctb_row == de->pic_height_in_ctbs_y - 1 ? - 0x70000 : 0x30000); - p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2); -} - -static void wpp_end_previous_slice(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - int ctb_addr_ts) -{ - int new_x = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y; - int new_y = s->sh->slice_segment_addr / de->pic_width_in_ctbs_y; - int last_x = - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y; - int last_y = - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y; - - if (de->wpp_entry_x < 2 && (de->wpp_entry_y < new_y || new_x > 2) && - de->pic_width_in_ctbs_y > 2) - wpp_pause(de, last_y); - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); - if (new_x == 2 || (de->pic_width_in_ctbs_y == 2 && - de->wpp_entry_y < new_y)) - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); +static void write_slice(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s, + const u32 slice_const, + const unsigned int ctb_col, + const unsigned int ctb_row) +{ + const unsigned int cs = (1 << s->log2_ctb_size); + const unsigned int w_last = s->sps.pic_width_in_luma_samples & (cs - 1); + const unsigned int h_last = s->sps.pic_height_in_luma_samples & (cs - 1); + + p1_apb_write(de, RPI_SLICE, + slice_const | + ((ctb_col + 1 < s->ctb_width || !w_last ? + cs : w_last) << 17) | + ((ctb_row + 1 < s->ctb_height || !h_last ? + cs : h_last) << 24)); } -////////////////////////////////////////////////////////////////////////////// -// Wavefront mode +#define PAUSE_MODE_WPP 1 +#define PAUSE_MODE_TILE 0xffff -static void wpp_entry_point(struct rpivid_dec_env *const de, +/* + * N.B. This can be called to fill in data from the previous slice so must not + * use any state data that may change from slice to slice (e.g. qp) + */ +static void new_entry_point(struct rpivid_dec_env *const de, const struct rpivid_dec_state *const s, - const int do_bte, - const int reset_qp_y, const int ctb_addr_ts) -{ - int ctb_size = 1 << s->log2_ctb_size; - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; - - int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->pic_width_in_ctbs_y; - int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->pic_width_in_ctbs_y; + const bool do_bte, + const bool reset_qp_y, + const u32 pause_mode, + const unsigned int tile_x, + const unsigned int tile_y, + const unsigned int ctb_col, + const unsigned int ctb_row, + const unsigned int slice_qp, + const u32 slice_const) +{ + const unsigned int endx = s->col_bd[tile_x + 1] - 1; + const unsigned int endy = (pause_mode == PAUSE_MODE_WPP) ? + ctb_row : s->row_bd[tile_y + 1] - 1; - int endx = de->pic_width_in_ctbs_y - 1; - int endy = ctb_row; - - u8 slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, - s->sps.pic_width_in_luma_samples, - s->col_bd, s->num_tile_columns); - u8 slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, - s->sps.pic_height_in_luma_samples, - s->row_bd, s->num_tile_rows); - - p1_apb_write(de, RPI_TILESTART, 0); - p1_apb_write(de, RPI_TILEEND, endx + (endy << 16)); + p1_apb_write(de, RPI_TILESTART, + s->col_bd[tile_x] | (s->row_bd[tile_y] << 16)); + p1_apb_write(de, RPI_TILEEND, endx | (endy << 16)); if (do_bte) - p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16)); + p1_apb_write(de, RPI_BEGINTILEEND, endx | (endy << 16)); - write_slice(de, s, slice_w, - ctb_row == de->pic_height_in_ctbs_y - 1 ? - slice_h : ctb_size); + write_slice(de, s, slice_const, endx, endy); if (reset_qp_y) { unsigned int sps_qp_bd_offset = 6 * s->sps.bit_depth_luma_minus8; - p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp); + p1_apb_write(de, RPI_QP, sps_qp_bd_offset + slice_qp); } p1_apb_write(de, RPI_MODE, - ctb_row == de->pic_height_in_ctbs_y - 1 ? - 0x60001 : 0x20001); - p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16)); + pause_mode | + ((endx == s->ctb_width - 1) << 17) | + ((endy == s->ctb_height - 1) << 18)); + + p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) | (ctb_row << 16)); + + de->entry_tile_x = tile_x; + de->entry_tile_y = tile_y; + de->entry_ctb_x = ctb_col; + de->entry_ctb_y = ctb_row; + de->entry_qp = slice_qp; + de->entry_slice = slice_const; } ////////////////////////////////////////////////////////////////////////////// // Wavefront mode +static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row) +{ + p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25); + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); + p1_apb_write(de, RPI_MODE, + ctb_row == de->pic_height_in_ctbs_y - 1 ? + 0x70000 : 0x30000); + p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2); +} + +static void wpp_entry_fill(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s, + const unsigned int last_y) +{ + const unsigned int last_x = s->ctb_width - 1; + + while (de->entry_ctb_y < last_y) { + /* wpp_entry_x/y set by wpp_entry_point */ + if (s->ctb_width > 2) + wpp_pause(de, de->entry_ctb_y); + p1_apb_write(de, RPI_STATUS, + (de->entry_ctb_y << 18) | (last_x << 5) | 2); + + /* if width == 1 then the saved state is the init one */ + if (s->ctb_width == 2) + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); + else + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); + + new_entry_point(de, s, false, true, PAUSE_MODE_WPP, + 0, 0, 0, de->entry_ctb_y + 1, + de->entry_qp, de->entry_slice); + } +} + +static void wpp_end_previous_slice(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s) +{ + wpp_entry_fill(de, s, s->prev_ctb_y); + + if (de->entry_ctb_x < 2 && + (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) && + s->ctb_width > 2) + wpp_pause(de, s->prev_ctb_y); + p1_apb_write(de, RPI_STATUS, + 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18)); + if (s->start_ctb_x == 2 || + (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y)) + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); +} + +/* Only main profile supported so WPP => !Tiles which makes some of the + * next chunk code simpler + */ static void wpp_decode_slice(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - const struct v4l2_ctrl_hevc_slice_params *sh, - int ctb_addr_ts) -{ - int i, reset_qp_y = 1; - int indep = !s->dependent_slice_segment_flag; - int ctb_col = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y; + const struct rpivid_dec_state *const s) +{ + bool reset_qp_y = true; + const bool indep = !s->dependent_slice_segment_flag; - if (ctb_addr_ts) - wpp_end_previous_slice(de, s, ctb_addr_ts); + if (s->start_ts) + wpp_end_previous_slice(de, s); pre_slice_decode(de, s); write_bitstream(de, s); - if (ctb_addr_ts == 0 || indep || de->pic_width_in_ctbs_y == 1) + + if (!s->start_ts || indep || s->ctb_width == 1) write_prob(de, s); - else if (ctb_col == 0) + else if (!s->start_ctb_x) p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); else - reset_qp_y = 0; + reset_qp_y = false; + program_slicecmds(de, s->slice_idx); new_slice_segment(de, s); - wpp_entry_point(de, s, indep, reset_qp_y, ctb_addr_ts); + new_entry_point(de, s, indep, reset_qp_y, PAUSE_MODE_WPP, + 0, 0, s->start_ctb_x, s->start_ctb_y, + s->slice_qp, slice_reg_const(s)); - for (i = 0; i < s->sh->num_entry_point_offsets; i++) { - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; - int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y; - int last_x = de->pic_width_in_ctbs_y - 1; + if (s->frame_end) { + wpp_entry_fill(de, s, s->ctb_height - 1); + + if (de->entry_ctb_x < 2 && s->ctb_width > 2) + wpp_pause(de, s->ctb_height - 1); - if (de->pic_width_in_ctbs_y > 2) - wpp_pause(de, ctb_row); p1_apb_write(de, RPI_STATUS, - (ctb_row << 18) + (last_x << 5) + 2); - if (de->pic_width_in_ctbs_y == 2) - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); - if (de->pic_width_in_ctbs_y == 1) - write_prob(de, s); - else - p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); - ctb_addr_ts += s->column_width[0]; - wpp_entry_point(de, s, 0, 1, ctb_addr_ts); + 1 | ((s->ctb_width - 1) << 5) | + ((s->ctb_height - 1) << 18)); } + } ////////////////////////////////////////////////////////////////////////////// // Tiles mode +static void tile_entry_fill(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s, + const unsigned int last_tile_x, + const unsigned int last_tile_y) +{ + while (de->entry_tile_y < last_tile_y || + (de->entry_tile_y == last_tile_y && + de->entry_tile_x < last_tile_x)) { + unsigned int t_x = de->entry_tile_x; + unsigned int t_y = de->entry_tile_y; + const unsigned int last_x = s->col_bd[t_x + 1] - 1; + const unsigned int last_y = s->row_bd[t_y + 1] - 1; + + p1_apb_write(de, RPI_STATUS, + 2 | (last_x << 5) | (last_y << 18)); + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); + + // Inc tile + if (++t_x >= s->tile_width) { + t_x = 0; + ++t_y; + } + + new_entry_point(de, s, false, true, PAUSE_MODE_TILE, + t_x, t_y, s->col_bd[t_x], s->row_bd[t_y], + de->entry_qp, de->entry_slice); + } +} + +/* + * Write STATUS register with expected end CTU address of previous slice + */ +static void end_previous_slice(struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s) +{ + tile_entry_fill(de, s, + ctb_to_tile_x(s, s->prev_ctb_x), + ctb_to_tile_y(s, s->prev_ctb_y)); + p1_apb_write(de, RPI_STATUS, + 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18)); +} + static void decode_slice(struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s, - const struct v4l2_ctrl_hevc_slice_params *const sh, - int ctb_addr_ts) + const struct rpivid_dec_state *const s) { - int i, reset_qp_y; + bool reset_qp_y; + unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x); + unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y); - if (ctb_addr_ts) - end_previous_slice(de, s, ctb_addr_ts); + if (s->start_ts) + end_previous_slice(de, s); pre_slice_decode(de, s); write_bitstream(de, s); -#if DEBUG_TRACE_P1_CMD - if (p1_z < 256) { - v4l2_info(&de->ctx->dev->v4l2_dev, - "TS=%d, tile=%d/%d, dss=%d, flags=%#llx\n", - ctb_addr_ts, s->tile_id[ctb_addr_ts], - s->tile_id[ctb_addr_ts - 1], - s->dependent_slice_segment_flag, sh->flags); - } -#endif - - reset_qp_y = ctb_addr_ts == 0 || - s->tile_id[ctb_addr_ts] != s->tile_id[ctb_addr_ts - 1] || - !s->dependent_slice_segment_flag; + reset_qp_y = !s->start_ts || + !s->dependent_slice_segment_flag || + tile_x != ctb_to_tile_x(s, s->prev_ctb_x) || + tile_y != ctb_to_tile_y(s, s->prev_ctb_y); if (reset_qp_y) write_prob(de, s); program_slicecmds(de, s->slice_idx); new_slice_segment(de, s); new_entry_point(de, s, !s->dependent_slice_segment_flag, reset_qp_y, - ctb_addr_ts); - - for (i = 0; i < s->sh->num_entry_point_offsets; i++) { - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts]; - int ctb_col = ctb_addr_rs % de->pic_width_in_ctbs_y; - int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y; - int tile_x = ctb_to_tile(ctb_col, s->col_bd, - s->num_tile_columns - 1); - int tile_y = - ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows - 1); - int last_x = s->col_bd[tile_x + 1] - 1; - int last_y = s->row_bd[tile_y + 1] - 1; + PAUSE_MODE_TILE, + tile_x, tile_y, s->start_ctb_x, s->start_ctb_y, + s->slice_qp, slice_reg_const(s)); + /* + * If this is the last slice then fill in the other tile entries + * now, otherwise this will be done at the start of the next slice + * when it will be known where this slice finishes + */ + if (s->frame_end) { + tile_entry_fill(de, s, + s->tile_width - 1, + s->tile_height - 1); p1_apb_write(de, RPI_STATUS, - 2 + (last_x << 5) + (last_y << 18)); - write_prob(de, s); - ctb_addr_ts += s->column_width[tile_x] * s->row_height[tile_y]; - new_entry_point(de, s, 0, 1, ctb_addr_ts); + 1 | ((s->ctb_width - 1) << 5) | + ((s->ctb_height - 1) << 18)); } } @@ -1132,13 +1159,12 @@ static void decode_slice(struct rpivid_d // Scaling factors static void expand_scaling_list(const unsigned int size_id, - const unsigned int matrix_id, u8 *const dst0, + u8 *const dst0, const u8 *const src0, uint8_t dc) { u8 *d; unsigned int x, y; - // FIXME: matrix_id is unused ? switch (size_id) { case 0: memcpy(dst0, src0, 16); @@ -1199,24 +1225,20 @@ static void populate_scaling_factors(con unsigned int mid; for (mid = 0; mid < 6; mid++) - expand_scaling_list(0, mid, - de->scaling_factors + + expand_scaling_list(0, de->scaling_factors + scaling_factor_offsets[0][mid], sl->scaling_list_4x4[mid], 0); for (mid = 0; mid < 6; mid++) - expand_scaling_list(1, mid, - de->scaling_factors + + expand_scaling_list(1, de->scaling_factors + scaling_factor_offsets[1][mid], sl->scaling_list_8x8[mid], 0); for (mid = 0; mid < 6; mid++) - expand_scaling_list(2, mid, - de->scaling_factors + + expand_scaling_list(2, de->scaling_factors + scaling_factor_offsets[2][mid], sl->scaling_list_16x16[mid], sl->scaling_list_dc_coef_16x16[mid]); - for (mid = 0; mid < 2; mid += 1) - expand_scaling_list(3, mid, - de->scaling_factors + + for (mid = 0; mid < 2; mid++) + expand_scaling_list(3, de->scaling_factors + scaling_factor_offsets[3][mid], sl->scaling_list_32x32[mid], sl->scaling_list_dc_coef_32x32[mid]); @@ -1228,8 +1250,6 @@ static void free_ps_info(struct rpivid_d s->ctb_addr_rs_to_ts = NULL; kfree(s->ctb_addr_ts_to_rs); s->ctb_addr_ts_to_rs = NULL; - kfree(s->tile_id); - s->tile_id = NULL; kfree(s->col_bd); s->col_bd = NULL; @@ -1237,10 +1257,52 @@ static void free_ps_info(struct rpivid_d s->row_bd = NULL; } +static unsigned int tile_width(const struct rpivid_dec_state *const s, + const unsigned int t_x) +{ + return s->col_bd[t_x + 1] - s->col_bd[t_x]; +} + +static unsigned int tile_height(const struct rpivid_dec_state *const s, + const unsigned int t_y) +{ + return s->row_bd[t_y + 1] - s->row_bd[t_y]; +} + +static void fill_rs_to_ts(struct rpivid_dec_state *const s) +{ + unsigned int ts = 0; + unsigned int t_y; + unsigned int tr_rs = 0; + + for (t_y = 0; t_y != s->tile_height; ++t_y) { + const unsigned int t_h = tile_height(s, t_y); + unsigned int t_x; + unsigned int tc_rs = tr_rs; + + for (t_x = 0; t_x != s->tile_width; ++t_x) { + const unsigned int t_w = tile_width(s, t_x); + unsigned int y; + unsigned int rs = tc_rs; + + for (y = 0; y != t_h; ++y) { + unsigned int x; + + for (x = 0; x != t_w; ++x) { + s->ctb_addr_rs_to_ts[rs + x] = ts; + s->ctb_addr_ts_to_rs[ts] = rs + x; + ++ts; + } + rs += s->ctb_width; + } + tc_rs += t_w; + } + tr_rs += t_h * s->ctb_width; + } +} + static int updated_ps(struct rpivid_dec_state *const s) { - unsigned int ctb_addr_rs; - int j, x, y, tile_id; unsigned int i; free_ps_info(s); @@ -1259,104 +1321,49 @@ static int updated_ps(struct rpivid_dec_ // Inferred parameters + s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size, + sizeof(*s->ctb_addr_rs_to_ts), + GFP_KERNEL); + s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size, + sizeof(*s->ctb_addr_ts_to_rs), + GFP_KERNEL); + if (!(s->pps.flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) { - s->num_tile_columns = 1; - s->num_tile_rows = 1; - s->column_width[0] = s->ctb_width; - s->row_height[0] = s->ctb_height; + s->tile_width = 1; + s->tile_height = 1; } else { - s->num_tile_columns = s->pps.num_tile_columns_minus1 + 1; - s->num_tile_rows = s->pps.num_tile_rows_minus1 + 1; - for (i = 0; i < s->num_tile_columns; ++i) - s->column_width[i] = s->pps.column_width_minus1[i] + 1; - for (i = 0; i < s->num_tile_rows; ++i) - s->row_height[i] = s->pps.row_height_minus1[i] + 1; + s->tile_width = s->pps.num_tile_columns_minus1 + 1; + s->tile_height = s->pps.num_tile_rows_minus1 + 1; } - s->col_bd = kmalloc((s->num_tile_columns + 1) * sizeof(*s->col_bd), + s->col_bd = kmalloc((s->tile_width + 1) * sizeof(*s->col_bd), GFP_KERNEL); - s->row_bd = kmalloc((s->num_tile_rows + 1) * sizeof(*s->row_bd), + s->row_bd = kmalloc((s->tile_height + 1) * sizeof(*s->row_bd), GFP_KERNEL); s->col_bd[0] = 0; - for (i = 0; i < s->num_tile_columns; i++) - s->col_bd[i + 1] = s->col_bd[i] + s->column_width[i]; + for (i = 1; i < s->tile_width; i++) + s->col_bd[i] = s->col_bd[i - 1] + + s->pps.column_width_minus1[i - 1] + 1; + s->col_bd[s->tile_width] = s->ctb_width; s->row_bd[0] = 0; - for (i = 0; i < s->num_tile_rows; i++) - s->row_bd[i + 1] = s->row_bd[i] + s->row_height[i]; + for (i = 1; i < s->tile_height; i++) + s->row_bd[i] = s->row_bd[i - 1] + + s->pps.row_height_minus1[i - 1] + 1; + s->row_bd[s->tile_height] = s->ctb_height; - s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size, - sizeof(*s->ctb_addr_rs_to_ts), - GFP_KERNEL); - s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size, - sizeof(*s->ctb_addr_ts_to_rs), - GFP_KERNEL); - s->tile_id = kmalloc_array(s->ctb_size, sizeof(*s->tile_id), - GFP_KERNEL); - - for (ctb_addr_rs = 0; ctb_addr_rs < s->ctb_size; ctb_addr_rs++) { - int tb_x = ctb_addr_rs % s->ctb_width; - int tb_y = ctb_addr_rs / s->ctb_width; - int tile_x = 0; - int tile_y = 0; - int val = 0; - - for (i = 0; i < s->num_tile_columns; i++) { - if (tb_x < s->col_bd[i + 1]) { - tile_x = i; - break; - } - } - - for (i = 0; i < s->num_tile_rows; i++) { - if (tb_y < s->row_bd[i + 1]) { - tile_y = i; - break; - } - } - - for (i = 0; i < tile_x; i++) - val += s->row_height[tile_y] * s->column_width[i]; - for (i = 0; i < tile_y; i++) - val += s->ctb_width * s->row_height[i]; - - val += (tb_y - s->row_bd[tile_y]) * s->column_width[tile_x] + - tb_x - s->col_bd[tile_x]; - - s->ctb_addr_rs_to_ts[ctb_addr_rs] = val; - s->ctb_addr_ts_to_rs[val] = ctb_addr_rs; - } - - for (j = 0, tile_id = 0; j < s->num_tile_rows; j++) - for (i = 0; i < s->num_tile_columns; i++, tile_id++) - for (y = s->row_bd[j]; y < s->row_bd[j + 1]; y++) - for (x = s->col_bd[i]; - x < s->col_bd[i + 1]; - x++) - s->tile_id[s->ctb_addr_rs_to_ts - [y * s->ctb_width + - x]] = tile_id; + fill_rs_to_ts(s); return 0; } -static int frame_end(struct rpivid_dev *const dev, - struct rpivid_dec_env *const de, - const struct rpivid_dec_state *const s) -{ - const unsigned int last_x = s->col_bd[s->num_tile_columns] - 1; - const unsigned int last_y = s->row_bd[s->num_tile_rows] - 1; - size_t cmd_size; - - if (s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED) { - if (de->wpp_entry_x < 2 && de->pic_width_in_ctbs_y > 2) - wpp_pause(de, last_y); - } - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18)); - +static int write_cmd_buffer(struct rpivid_dev *const dev, + struct rpivid_dec_env *const de, + const struct rpivid_dec_state *const s) +{ // Copy commands out to dma buf - cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]); + const size_t cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]); if (!de->cmd_copy_gptr->ptr || cmd_size > de->cmd_copy_gptr->size) { size_t cmd_alloc = round_up_size(cmd_size); @@ -1521,18 +1528,19 @@ static void rpivid_h265_setup(struct rpi struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; struct rpivid_dec_state *const s = ctx->state; struct vb2_queue *vq; - struct rpivid_dec_env *de; - int ctb_addr_ts; + struct rpivid_dec_env *de = ctx->dec0; + unsigned int prev_rs; unsigned int i; int use_aux; bool slice_temporal_mvp; + xtrace_in(dev, de); + pred_weight_table = &sh->pred_weight_table; s->frame_end = ((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0); - de = ctx->dec0; slice_temporal_mvp = (sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED); @@ -1662,6 +1670,13 @@ static void rpivid_h265_setup(struct rpi s->sps.pic_height_in_luma_samples); goto fail; } + if ((s->tile_width != 1 || s->tile_height != 1) && + (s->pps.flags & + V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED)) { + v4l2_warn(&dev->v4l2_dev, + "Tiles + WPP not supported\n"); + goto fail; + } // Fill in ref planes with our address s.t. if we mess // up refs somehow then we still have a valid address @@ -1760,15 +1775,24 @@ static void rpivid_h265_setup(struct rpi if (s->sps.flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED) populate_scaling_factors(run, de, s); - ctb_addr_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr]; + // Calc all the random coord info to avoid repeated conversion in/out + s->start_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr]; + s->start_ctb_x = sh->slice_segment_addr % de->pic_width_in_ctbs_y; + s->start_ctb_y = sh->slice_segment_addr / de->pic_width_in_ctbs_y; + // Last CTB of previous slice + prev_rs = !s->start_ts ? 0 : s->ctb_addr_ts_to_rs[s->start_ts - 1]; + s->prev_ctb_x = prev_rs % de->pic_width_in_ctbs_y; + s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y; if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED)) - wpp_decode_slice(de, s, sh, ctb_addr_ts); + wpp_decode_slice(de, s); else - decode_slice(de, s, sh, ctb_addr_ts); + decode_slice(de, s); - if (!s->frame_end) + if (!s->frame_end) { + xtrace_ok(dev, de); return; + } // Frame end memset(dpb_q_aux, 0, @@ -1776,8 +1800,9 @@ static void rpivid_h265_setup(struct rpi /* * Need Aux ents for all (ref) DPB ents if temporal MV could * be enabled for any pic - * ** At the moment we have aux ents for all pics whether or not - * they are ref + * ** At the moment we create aux ents for all pics whether or not + * they are ref - they should then be discarded by the DPB-aux + * garbage collection code */ use_aux = ((s->sps.flags & V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) != 0); @@ -1795,7 +1820,7 @@ static void rpivid_h265_setup(struct rpi } // v4l2_info(&dev->v4l2_dev, "rpivid_h265_end of frame\n"); - if (frame_end(dev, de, s)) + if (write_cmd_buffer(dev, de, s)) goto fail; for (i = 0; i < sh->num_active_dpb_entries; ++i) { @@ -1876,6 +1901,7 @@ static void rpivid_h265_setup(struct rpi } de->state = RPIVID_DECODE_PHASE1; + xtrace_ok(dev, de); return; fail: @@ -1883,6 +1909,7 @@ fail: // Actual error reporting happens in Trigger de->state = s->frame_end ? RPIVID_DECODE_ERROR_DONE : RPIVID_DECODE_ERROR_CONTINUE; + xtrace_fail(dev, de); } ////////////////////////////////////////////////////////////////////////////// @@ -2210,6 +2237,10 @@ static int rpivid_h265_start(struct rpiv size_t pu_alloc; size_t coeff_alloc; +#if DEBUG_TRACE_P1_CMD + p1_z = 0; +#endif + // Generate a sanitised WxH for memory alloc // Assume HD if unset if (w == 0)