24 #include "../ops_internal.h"
25 #include "../swscale_internal.h"
29 #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H
37 #if CONFIG_LIBSHADERC || CONFIG_LIBGLSLANG
39 s->spvc->uninit(&
s->spvc);
57 if (
s->vkctx.device_ref &&
s->vkctx.device_ref->data != dev_ref->
data) {
60 }
else if (
s->vkctx.device_ref &&
s->vkctx.device_ref->data == dev_ref->
data) {
74 #if CONFIG_LIBSHADERC || CONFIG_LIBGLSLANG
76 s->spvc = ff_vk_spirv_init();
89 return s ?
s->vkctx.device_ref :
NULL;
/*
 * Capacity limits for the auxiliary data buffers.
 * MAX_DATA_BUFS is the combined budget: MAX_DITHER_BUFS plus four times
 * MAX_FILT_BUFS (4 + 4*4 = 20).
 * NOTE(review): presumably this bounds the data_bufs array that is indexed
 * by nb_data_bufs; the reason for the x4 factor on filter buffers is not
 * visible in this excerpt - confirm against the code that fills data_bufs.
 */
92 #define MAX_DITHER_BUFS 4
93 #define MAX_FILT_BUFS 4
94 #define MAX_DATA_BUFS (MAX_DITHER_BUFS + MAX_FILT_BUFS*4)
117 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
118 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT);
120 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
121 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT);
129 0, 0, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
131 0, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
134 VkImageMemoryBarrier2 img_bar[8];
136 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
137 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
138 VK_ACCESS_SHADER_READ_BIT,
139 VK_IMAGE_LAYOUT_GENERAL,
140 VK_QUEUE_FAMILY_IGNORED);
142 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
143 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
144 VK_ACCESS_SHADER_WRITE_BIT,
145 VK_IMAGE_LAYOUT_GENERAL,
146 VK_QUEUE_FAMILY_IGNORED);
147 vk->CmdPipelineBarrier2(ec->buf, &(VkDependencyInfo) {
148 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
149 .pImageMemoryBarriers = img_bar,
150 .imageMemoryBarrierCount = nb_img_bar,
155 vk->CmdDispatch(ec->buf,
156 FFALIGN(dst_f->width,
p->shd.lg_size[0])/
p->shd.lg_size[0],
157 FFALIGN(dst_f->height,
p->shd.lg_size[1])/
p->shd.lg_size[1],
169 for (
int i = 0;
i <
p->nb_data_bufs;
i++)
184 VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
185 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
186 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
192 (uint8_t **)&weights_data, 0);
218 VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
219 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
220 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
229 for (
int i = 0;
i <
size;
i++) {
230 for (
int j = 0; j <
size; j++) {
249 for (
int n = 0; n < ops->
num_ops; n++) {
254 &
p->data_bufs[
p->nb_data_bufs]);
261 &
p->data_bufs[
p->nb_data_bufs]);
269 &
p->data_bufs[
p->nb_data_bufs]);
279 for (
int i = 0;
i <
p->nb_data_bufs;
i++)
284 #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H
295 typedef struct SPIRVIDs {
329 int linear_deco_off[16];
330 int linear_deco_ops[16];
334 int dither_ptr_elem_id;
338 int out_img_array_id;
354 SPICtx *spi, SPIRVIDs *
id)
373 id->in_vars, 3 +
id->nb_dither_bufs);
379 SpvBuiltInGlobalInvocationId);
391 for (
int i = 0;
i <
id->nb_dither_bufs;
i++) {
395 id->dither[
i].size*
sizeof(
float));
406 for (
int n = 0; n < ops->
num_ops; n++) {
413 for (
int j = 0; j < 4; j++) {
414 nb_ops += !!
op->lin.m[j][0].num;
415 nb_ops +=
op->lin.m[j][0].num &&
op->lin.m[j][4].num;
416 for (
int i = 1;
i < 4;
i++) {
417 nb_ops += !!
op->lin.m[j][
i].num;
418 nb_ops +=
op->lin.m[j][
i].num &&
419 (
op->lin.m[j][0].num ||
op->lin.m[j][4].num);
423 id->linear_deco_off[
id->nb_linear_ops] =
spi_reserve(spi, nb_ops*4*3);
424 id->linear_deco_ops[
id->nb_linear_ops] = nb_ops;
436 id->u32_type = spi_OpTypeInt(spi, 32, 0);
437 id->i32_type = spi_OpTypeInt(spi, 32, 1);
439 id->f32_type = spi_OpTypeFloat(spi, 32);
443 id->bvec2_type = spi_OpTypeVector(spi,
id->b_type, 2);
444 id->u32vec2_type = spi_OpTypeVector(spi, u32_type, 2);
445 id->i32vec2_type = spi_OpTypeVector(spi,
id->i32_type, 2);
447 id->u32vec3_type = spi_OpTypeVector(spi, u32_type, 3);
449 id->u32vec4_type = spi_OpTypeVector(spi, u32_type, 4);
450 id->f32vec4_type = spi_OpTypeVector(spi, f32_type, 4);
451 id->f32mat4_type = spi_OpTypeMatrix(spi,
id->f32vec4_type, 4);
457 for (
int i = 0;
i < 5;
i++)
461 id->nb_const_ids = 0;
462 for (
int n = 0; n < ops->
num_ops; n++) {
474 id->const_ids[
id->nb_const_ids++] =
tmp;
478 for (
int i = 0;
i < 4;
i++) {
484 id->const_ids[
id->nb_const_ids++] =
488 id->const_ids[
id->nb_const_ids++] =
498 id->const_ids[
id->nb_const_ids++] =
tmp;
504 float q =
op->scale.factor.num/(
float)
op->scale.factor.den;
514 id->const_ids[
id->nb_const_ids++] =
tmp;
519 for (
int i = 0;
i < 4;
i++) {
522 if (!
op->clamp.limit[
i].den) {
531 id->const_ids[
id->nb_const_ids++] =
tmp;
535 for (
int i = 0;
i < 4;
i++) {
536 if (
op->dither.y_offset[
i] < 0)
539 id->const_ids[
id->nb_const_ids++] =
tmp;
545 for (
int i = 0;
i < 4;
i++) {
546 for (
int j = 0; j < 4; j++) {
550 id->const_ids[
id->nb_const_ids++] =
554 id->const_ids[
id->nb_const_ids - 4],
555 id->const_ids[
id->nb_const_ids - 3],
556 id->const_ids[
id->nb_const_ids - 2],
557 id->const_ids[
id->nb_const_ids - 1]);
558 id->const_ids[
id->nb_const_ids++] =
tmp;
562 id->const_ids[
id->nb_const_ids - 5*4 + 4],
563 id->const_ids[
id->nb_const_ids - 5*3 + 4],
564 id->const_ids[
id->nb_const_ids - 5*2 + 4],
565 id->const_ids[
id->nb_const_ids - 5*1 + 4]);
566 id->const_ids[
id->nb_const_ids++] =
tmp;
568 for (
int i = 0;
i < 4;
i++) {
570 id->const_ids[
id->nb_const_ids++] =
575 id->const_ids[
id->nb_const_ids - 4],
576 id->const_ids[
id->nb_const_ids - 3],
577 id->const_ids[
id->nb_const_ids - 2],
578 id->const_ids[
id->nb_const_ids - 1]);
579 id->const_ids[
id->nb_const_ids++] =
tmp;
589 static void define_shader_bindings(
SwsOpList *ops,
SPICtx *spi, SPIRVIDs *
id,
590 int in_img_count,
int out_img_count)
595 struct DitherData *
dither =
id->dither;
596 for (
int i = 0;
i <
id->nb_dither_bufs;
i++) {
605 SpvStorageClassUniform, 0);
614 id->f32_type :
id->u32_type,
615 2, 0, 0, 0, 2, SpvImageFormatUnknown);
617 id->u32_cid[out_img_count]);
620 id->in_img_array_id = 0;
626 id->in_img_type = match ?
id->out_img_type :
629 id->f32_type :
id->u32_type,
630 2, 0, 0, 0, 2, SpvImageFormatUnknown);
632 id->u32_cid[in_img_count]);
639 id->out_img_array_id);
647 id->in_img_array_id);
654 SpvStorageClassInput, 0);
657 SpvStorageClassUniformConstant, 0);
660 SpvStorageClassUniformConstant, 0);
663 static int insert_vmat_linear(
const SwsOp *
op,
SPICtx *spi, SPIRVIDs *
id,
664 int data,
int const_off)
666 data = spi_OpMatrixTimesVector(spi,
id->f32vec4_type,
667 id->const_ids[const_off + 4*5],
669 return spi_OpFAdd(spi,
id->f32vec4_type,
670 id->const_ids[const_off + 4*5 + 1 + 4],
data);
673 static int insert_bitexact_linear(
const SwsOp *
op,
SPICtx *spi, SPIRVIDs *
id,
674 int data,
int linear_ops_idx,
int const_off)
677 int type_v =
op->type ==
SWS_PIXEL_F32 ?
id->f32vec4_type :
id->u32vec4_type;
686 spi->
off =
id->linear_deco_off[linear_ops_idx];
687 for (
int i = 0;
i <
id->linear_deco_ops[linear_ops_idx];
i++)
692 for (
int j = 0; j < 4; j++) {
694 if (
op->lin.m[j][0].num)
695 res[j] = spi_OpFMul(spi, type_s,
tmp[0],
696 id->const_ids[const_off + j*5 + 0]);
698 if (
op->lin.m[j][0].num &&
op->lin.m[j][4].num)
699 res[j] = spi_OpFAdd(spi, type_s,
700 id->const_ids[const_off + 4*5 + 1 + j], res[j]);
701 else if (
op->lin.m[j][4].num)
702 res[j] =
id->const_ids[const_off + 4*5 + 1 + j];
704 for (
int i = 1;
i < 4;
i++) {
705 if (!
op->lin.m[j][
i].num)
708 int v = spi_OpFMul(spi, type_s,
tmp[
i],
709 id->const_ids[const_off + j*5 +
i]);
710 if (
op->lin.m[j][0].num ||
op->lin.m[j][4].num)
711 res[j] = spi_OpFAdd(spi, type_s, res[j], v);
718 res[0], res[1], res[2], res[3]);
724 uint8_t spvbuf[1024*16];
725 SPICtx spi_context = { 0 }, *spi = &spi_context;
726 SPIRVIDs spid_data = { 0 }, *
id = &spid_data;
727 spi_init(spi, spvbuf,
sizeof(spvbuf));
734 (uint32_t []) { 32, 32, 1 }, 0);
749 .
type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
750 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
754 .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
755 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
772 id->nb_dither_bufs = 0;
773 for (
int n = 0; n < ops->
num_ops; n++) {
778 id->
dither[
id->nb_dither_bufs].size = 1 <<
op->dither.size_log2;
779 id->dither[
id->nb_dither_bufs].arr_1d_id =
spi_get_id(spi);
780 id->dither[
id->nb_dither_bufs].arr_2d_id =
spi_get_id(spi);
781 id->dither[
id->nb_dither_bufs].struct_id =
spi_get_id(spi);
782 id->dither[
id->nb_dither_bufs].id =
spi_get_id(spi);
783 id->in_vars[3 +
id->nb_dither_bufs] =
id->dither[
id->nb_dither_bufs].id;
786 .
type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
787 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
790 if (
id->nb_dither_bufs)
792 id->nb_dither_bufs, 1, 0);
795 define_shader_header(sws, shd, ops, spi,
id);
796 define_shader_consts(sws, ops, spi,
id);
797 define_shader_bindings(ops, spi,
id, in_img_count, out_img_count);
804 int in_img[4] = { 0 };
805 for (
int i = 0;
i < in_img_count;
i++) {
808 id->in_vars[1],
id->u32_cid[
i]);
810 SpvMemoryAccessMaskNone, 0);
815 for (
int i = 0;
i < out_img_count;
i++) {
817 id->in_vars[2],
id->u32_cid[
i]);
819 SpvMemoryAccessMaskNone, 0);
824 SpvMemoryAccessMaskNone, 0);
828 int gi2 = spi_OpBitcast(spi,
id->i32vec2_type, gid);
831 int img1_s = spi_OpImageQuerySize(spi,
id->i32vec2_type, out_img[0]);
832 int scmp = spi_OpSGreaterThanEqual(spi,
id->bvec2_type, gi2, img1_s);
833 scmp = spi_OpAny(spi,
id->b_type, scmp);
848 id->f32_p,
id->f32_p,
849 id->f32_p,
id->f32_p);
852 id->u32_p,
id->u32_p,
853 id->u32_p,
id->u32_p);
856 int nb_const_ids = 0;
857 int nb_dither_bufs = 0;
858 int nb_linear_ops = 0;
861 for (
int n = 0; n < ops->
num_ops; n++) {
864 op->convert.to :
op->type;
866 id->f32vec4_type :
id->u32vec4_type;
868 id->f32_type :
id->u32_type;
870 id->f32_p :
id->u32_p;
874 if (
op->rw.frac ||
op->rw.filter) {
876 }
else if (
op->rw.packed) {
878 gid, SpvImageOperandsMaskNone);
881 for (
int i = 0;
i <
op->rw.elems;
i++) {
884 SpvImageOperandsMaskNone);
892 if (
op->rw.frac ||
op->rw.filter) {
894 }
else if (
op->rw.packed) {
896 SpvImageOperandsMaskNone);
898 for (
int i = 0;
i <
op->rw.elems;
i++) {
902 SpvImageOperandsMaskNone);
907 for (
int i = 0;
i < 4;
i++) {
908 if (!
op->clear.value[
i].den)
911 id->const_ids[nb_const_ids++],
924 data = spi_OpIMul(spi, type_v,
data,
id->const_ids[nb_const_ids++]);
926 data = spi_OpConvertFToU(spi, type_v,
data);
928 data = spi_OpConvertUToF(spi, type_v,
data);
931 data = spi_OpShiftLeftLogical(spi, type_v,
data,
932 id->const_ids[nb_const_ids++]);
935 data = spi_OpShiftRightLogical(spi, type_v,
data,
936 id->const_ids[nb_const_ids++]);
940 data = spi_OpFMul(spi, type_v,
data,
941 id->const_ids[nb_const_ids++]);
943 data = spi_OpIMul(spi, type_v,
data,
944 id->const_ids[nb_const_ids++]);
949 op->op ==
SWS_OP_MIN ? GLSLstd450FMin : GLSLstd450FMax :
951 for (
int i = 0;
i < 4;
i++) {
952 if (!
op->clamp.limit[
i].den)
956 tmp,
id->const_ids[nb_const_ids++]);
962 int did = nb_dither_bufs++;
965 x_id = spi_OpBitwiseAnd(spi,
id->u32_type, x_id,
966 id->dither[did].mask_id);
967 for (
int i = 0;
i < 4;
i++) {
968 if (
op->dither.y_offset[
i] < 0)
971 int y_id = spi_OpIAdd(spi,
id->u32_type, y_pos,
972 id->const_ids[nb_const_ids++]);
973 y_id = spi_OpBitwiseAnd(spi,
id->u32_type, y_id,
974 id->dither[did].mask_id);
977 id->dither[did].id,
id->u32_cid[0],
980 SpvMemoryAccessMaskNone, 0);
990 data = insert_bitexact_linear(
op, spi,
id,
data, nb_linear_ops, nb_const_ids);
992 data = insert_vmat_linear(
op, spi,
id,
data, nb_const_ids);
994 nb_const_ids += 5*5 + 1;
1026 #if CONFIG_LIBSHADERC || CONFIG_LIBGLSLANG
1038 .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
1039 .mem_layout = img_type,
1043 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
/*
 * printf-style helpers for printing a rational value.
 * QSTR is the format fragment: "(<int>/<int><suffix>)".
 * QTYPE(Q) expands to the three matching arguments: Q's .num and .den
 * members followed by the suffix string, which is ".0f" when cur_type is
 * SWS_PIXEL_F32 and empty otherwise.
 * QTYPE relies on a variable named cur_type being in scope at the point of
 * expansion, and it evaluates Q twice, so Q should have no side effects.
 */
1049 #define QSTR "(%i/%i%s)"
1050 #define QTYPE(Q) (Q).num, (Q).den, cur_type == SWS_PIXEL_F32 ? ".0f" : ""
1053 int idx,
const char *type_name,
1054 const char *type_v,
const char *type_s)
1057 if (
op->rw.filter) {
1062 av_bprintf(&shd->
src,
" int o = filter_o%i[%s];\n", idx, axis);
1067 if (
op->rw.packed) {
1068 GLSLF(2,
tmp +=
w * %
s(imageLoad(src_img[%
i], ivec2(%
s, %
s))); ,
1069 type_v, ops->
plane_src[0], coord_x, coord_y);
1071 for (
int i = 0;
i <
op->rw.elems;
i++)
1073 tmp.%
c +=
w * %
s(imageLoad(src_img[%
i], ivec2(%
s, %
s))[0]); ,
1074 "xyzw"[
i], type_s, ops->
plane_src[
i], coord_x, coord_y);
1079 if (
op->rw.packed) {
1083 for (
int i = 0;
i <
op->rw.elems;
i++)
1096 void *spv_opaque =
NULL;
1103 VK_SHADER_STAGE_COMPUTE_BIT,
1104 NULL, 0, 32, 32, 1, 0);
1114 add_desc_read_write(&
buf_desc[nb_desc++], &
p->src_rep,
read);
1115 add_desc_read_write(&
buf_desc[nb_desc++], &
p->dst_rep, write);
1125 for (
int n = 0; n < ops->
num_ops; n++) {
1128 int size = (1 <<
op->dither.size_log2);
1130 snprintf(data_buf_name[nb_desc], 256,
"dither_buf%i", n);
1131 snprintf(data_str_name[nb_desc], 256,
"float dither_mat%i[%i][%i];",
1134 .name = data_buf_name[nb_desc],
1135 .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
1136 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
1137 .mem_layout =
"scalar",
1138 .buf_content = data_str_name[nb_desc],
1146 op->rw.kernel :
op->filter.kernel;
1147 snprintf(data_buf_name[nb_desc], 256,
"filter_buf%i", n);
1148 snprintf(data_str_name[nb_desc], 256,
1149 "float filter_w%i[%i][%i];\n"
1150 " int filter_o%i[%i];",
1154 .name = data_buf_name[nb_desc],
1155 .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
1156 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
1157 .mem_layout =
"scalar",
1158 .buf_content = data_str_name[nb_desc],
1169 GLSLC(1, ivec2
pos = ivec2(gl_GlobalInvocationID.xy); );
1170 GLSLC(1, ivec2
size = imageSize(dst_img[0]); );
1174 GLSLC(1, u8vec4 u8; );
1175 GLSLC(1, u16vec4 u16; );
1176 GLSLC(1, u32vec4 u32; );
1177 GLSLC(1, precise f32vec4 f32; );
1181 for (
int n = 0; n < ops->
num_ops; n++) {
1186 const char *type_v = cur_type ==
SWS_PIXEL_F32 ?
"f32vec4" :
1198 read_glsl(ops,
op, shd, n, type_name, type_v, type_s);
1202 if (
op->rw.frac ||
op->rw.filter) {
1204 }
else if (
op->rw.packed) {
1208 for (
int i = 0;
i <
op->rw.elems;
i++)
1216 for (
int i = 0;
i < 4;
i++)
1222 for (
int i = 0;
i < 4;
i++) {
1226 "xyzw"[
i], type_s, QTYPE(
op->clear.value[
i]));
1232 type_name, type_name, QTYPE(
op->scale.factor));
1236 for (
int i = 0;
i < 4;
i++) {
1237 if (!
op->clamp.limit[
i].den)
1240 type_name,
"xyzw"[
i],
1242 type_name,
"xyzw"[
i], QTYPE(
op->clamp.limit[
i]));
1262 int size = (1 <<
op->dither.size_log2);
1263 for (
int i = 0;
i < 4;
i++) {
1264 if (
op->dither.y_offset[
i] < 0)
1266 av_bprintf(&shd->
src,
" %s.%c += dither_mat%i[(pos.y + %i) & %i]"
1268 type_name,
"xyzw"[
i], n,
1269 op->dither.y_offset[
i],
size - 1,
1275 for (
int i = 0;
i < 4;
i++) {
1276 if (
op->lin.m[
i][4].num)
1278 QTYPE(
op->lin.m[
i][4]));
1281 for (
int j = 0; j < 4; j++) {
1282 if (!
op->lin.m[
i][j].num)
1285 "xyzw"[
i],
"xyzw"[j], QTYPE(
op->lin.m[
i][j]));
1292 av_bprintf(&shd->
src,
" %s = %s.%s;\n", type_name, type_name,
1297 av_bprintf(&shd->
src,
" %s = %s.%s;\n", type_name, type_name,
1307 err =
s->spvc->compile_shader(&
s->vkctx,
s->spvc, shd,
1308 &spv_data, &spv_len,
"main",
1316 s->spvc->free_shader(
s->spvc, &spv_opaque);
1347 VkFormatProperties2 prop = {
1348 .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
1351 vk->GetPhysicalDeviceFormatProperties2(
s->vkctx.hwctx->phys_dev,
1352 VK_FORMAT_B8G8R8A8_UNORM,
1354 if (!(prop.formatProperties.optimalTilingFeatures &
1355 VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT)) {
1363 #if CONFIG_LIBSHADERC || CONFIG_LIBGLSLANG
1364 err = add_ops_glsl(sws,
p,
s, ops, &
p->shd);
1368 #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H
1369 err = add_ops_spirv(sws,
p,
s, ops, &
p->shd);
1379 for (
int i = 0;
i <
p->nb_data_bufs;
i++)
1381 1,
i, 0, &
p->data_bufs[
i],
1382 0, VK_WHOLE_SIZE, VK_FORMAT_UNDEFINED);
1398 #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H
1406 .compile = compile_spirv,
1411 #if CONFIG_LIBSHADERC || CONFIG_LIBGLSLANG
1419 .compile = compile_glsl,