From 847d7b6980d19ef1197bdd4106b4ae9268e1abdc Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 26 Sep 2021 23:45:44 +0200 Subject: [PATCH 1/5] Fix minor error in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e9532d00..b3160832 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,11 @@ Install the following: #### Common ``` -sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev +sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libevdev-dev ``` #### X11 ``` -sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev +sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev ``` #### KMS From 4177b020647131be59d85d8b19c8e3143e820b36 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 17:58:35 +0200 Subject: [PATCH 2/5] Allow cuda kernel to run in parallell --- sunshine/platform/linux/cuda.cpp | 70 +++++++++++++++++++++++++------- sunshine/platform/linux/cuda.cu | 25 ++++++++++-- sunshine/platform/linux/cuda.h | 25 ++++++++---- sunshine/platform/linux/misc.cpp | 4 -- 4 files changed, 95 insertions(+), 29 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 8a25eb27..b96dcf17 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) { return 0; } +void freeStream(CUstream stream) { + CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream"); +} + class img_t : public platf::img_t { public: tex_t tex; @@ -95,7 +99,8 @@ public: this->hwframe.reset(frame); this->frame = frame; - if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) { + auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data; + if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) { BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv; return -1; } @@ -106,6 +111,15 @@ public: return -1; } + auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx; + + stream = make_stream(); + if(!stream) { + return -1; + } + + cuda_ctx->stream = stream.get(); + auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4); if(!sws_opt) { return -1; @@ -143,13 +157,14 @@ public: return; } - sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 }); + sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 }); } cudaTextureObject_t tex_obj(const tex_t &tex) const { return linear_interpolation ? tex.texture.linear : tex.texture.point; } + stream_t stream; frame_t hwframe; int width, height; @@ -163,7 +178,7 @@ public: class cuda_ram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex)); + return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get()); } int set_frame(AVFrame *frame) { @@ -187,7 +202,7 @@ public: class cuda_vram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex)); + return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get()); } }; @@ -257,6 +272,28 @@ int init() { return 0; } +class ctx_t { +public: + ctx_t(NVFBC_SESSION_HANDLE handle) { + NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER }; + + if(func.nvFBCBindContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle); + } + + this->handle = handle; + } + + ~ctx_t() { + NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER }; + if(func.nvFBCReleaseContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle); + } + } + + NVFBC_SESSION_HANDLE handle; +}; + class handle_t { enum flag_e { SESSION_HANDLE, @@ -348,24 +385,26 @@ public: return 0; } - ~handle_t() { + int reset() { if(!handle_flags[SESSION_HANDLE]) { - return; + return 0; } - if(handle_flags[SESSION_CAPTURE]) { - NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER }; - - if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) { - BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle); - } - } + stop(); NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; if(func.nvFBCDestroyHandle(handle, ¶ms)) { BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle); } + + handle_flags[SESSION_HANDLE] = false; + + return 0; + } + + ~handle_t() { + reset(); } std::bitset handle_flags; @@ -381,6 +420,8 @@ public: return -1; } + ctx_t ctx { handle->handle }; + auto status_params = handle->status(); if(!status_params) { return -1; @@ -443,8 +484,9 @@ public: // Force display_t::capture to initialize handle_t::capture cursor_visible = !*cursor; + ctx_t ctx { handle.handle }; auto fg = util::fail_guard([&]() { - handle.stop(); + handle.reset(); }); while(img) { diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index e93f7d9f..acf2d76d 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) { CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer"); } +void freeCudaStream_t::operator()(cudaStream_t ptr) { + CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream"); +} + +stream_t make_stream(int flags) { + cudaStream_t stream; + + if(!flags) { + CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream"); + } + else { + CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags"); + } + + return stream_t { stream }; +} + inline __device__ float3 bgra_to_rgb(uchar4 vec) { return make_float3((float)vec.z, (float)vec.y, (float)vec.x); } @@ -260,18 +277,18 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i return std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { - return convert(Y, UV, pitchY, pitchUV, texture, viewport); +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) { + return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) { +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) { int threadsX = viewport.width / 2; int threadsY = viewport.height; dim3 block(threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), threadsY); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index e46b4759..7e81ae99 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,16 +1,17 @@ #if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA) #define SUNSHINE_PLATFORM_CUDA_H -#include #include #include +#include namespace platf { - class hwdevice_t; - class img_t; -} +class hwdevice_t; +class img_t; +} // namespace platf namespace cuda { + namespace nvfbc { std::vector display_names(); } @@ -21,8 +22,10 @@ int init(); typedef struct cudaArray *cudaArray_t; #if !defined(__CUDACC__) +typedef struct CUstream_st *cudaStream_t; typedef unsigned long long cudaTextureObject_t; #else /* defined(__CUDACC__) */ +typedef __location__(device_builtin) struct CUstream_st *cudaStream_t; typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; #endif /* !defined(__CUDACC__) */ @@ -33,7 +36,15 @@ public: void operator()(void *ptr); }; -using ptr_t = std::unique_ptr; +class freeCudaStream_t { +public: + void operator()(cudaStream_t ptr); +}; + +using ptr_t = std::unique_ptr; +using stream_t = std::unique_ptr; + +stream_t make_stream(int flags = 0); struct viewport_t { int width, height; @@ -75,8 +86,8 @@ public: static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture); - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport); void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index dd114ec4..6d7643c0 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -264,13 +264,11 @@ std::unique_ptr init() { #endif #ifdef SUNSHINE_BUILD_CUDA if(verify_nvfbc()) { - BOOST_LOG(info) << "Using NvFBC for screencasting"sv; sources[source::NVFBC] = true; } #endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { - BOOST_LOG(info) << "Using Wayland for screencasting"sv; sources[source::WAYLAND] = true; } #endif @@ -282,13 +280,11 @@ std::unique_ptr init() { display_cursor = false; } - BOOST_LOG(info) << "Using KMS for screencasting"sv; sources[source::KMS] = true; } #endif #ifdef SUNSHINE_BUILD_X11 if(verify_x11()) { - BOOST_LOG(info) << "Using X11 for screencasting"sv; sources[source::X11] = true; } #endif From 9f14b2278d7e77ab2980f44216ab782ebf21930f Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 19:12:42 +0200 Subject: [PATCH 3/5] Fix rate control for hevc with amdvce --- sunshine/config.cpp | 35 +++++++++++++++++++++++++++++------ sunshine/config.h | 3 ++- sunshine/video.cpp | 8 ++++---- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/sunshine/config.cpp b/sunshine/config.cpp index 18083d9a..a35d5955 100644 --- a/sunshine/config.cpp +++ b/sunshine/config.cpp @@ -99,16 +99,22 @@ enum quality_e : int { _default = 0, speed, balanced, - //quality2, }; -enum rc_e : int { +enum class rc_hevc_e : int { constqp, /**< Constant QP mode */ vbr_latency, /**< Latency Constrained Variable Bitrate */ vbr_peak, /**< Peak Contrained Variable Bitrate */ cbr, /**< Constant bitrate mode */ }; +enum class rc_h264_e : int { + constqp, /**< Constant QP mode */ + cbr, /**< Constant bitrate mode */ + vbr_peak, /**< Peak Contrained Variable Bitrate */ + vbr_latency, /**< Latency Constrained Variable Bitrate */ +}; + enum coder_e : int { _auto = 0, cabac, @@ -120,15 +126,25 @@ std::optional quality_from_view(const std::string_view &quality) { if(quality == #x##sv) return x _CONVERT_(speed); _CONVERT_(balanced); - //_CONVERT_(quality2); if(quality == "default"sv) return _default; #undef _CONVERT_ return std::nullopt; } -std::optional rc_from_view(const std::string_view &rc) { +std::optional rc_h264_from_view(const std::string_view &rc) { #define _CONVERT_(x) \ - if(rc == #x##sv) return x + if(rc == #x##sv) return (int)rc_h264_e::x + _CONVERT_(constqp); + _CONVERT_(vbr_latency); + _CONVERT_(vbr_peak); + _CONVERT_(cbr); +#undef _CONVERT_ + return std::nullopt; +} + +std::optional rc_hevc_from_view(const std::string_view &rc) { +#define _CONVERT_(x) \ + if(rc == #x##sv) return (int)rc_hevc_e::x _CONVERT_(constqp); _CONVERT_(vbr_latency); _CONVERT_(vbr_peak); @@ -165,6 +181,7 @@ video_t video { { amd::balanced, std::nullopt, + std::nullopt, -1 }, // amd {}, // encoder @@ -659,8 +676,14 @@ void apply_config(std::unordered_map &&vars) { int_f(vars, "nv_coder", video.nv.coder, nv::coder_from_view); int_f(vars, "amd_quality", video.amd.quality, amd::quality_from_view); - int_f(vars, "amd_rc", video.amd.rc, amd::rc_from_view); + + std::string rc; + string_f(vars, "amd_rc", rc); int_f(vars, "amd_coder", video.amd.coder, amd::coder_from_view); + if(!rc.empty()) { + video.amd.rc_h264 = amd::rc_h264_from_view(rc); + video.amd.rc_hevc = amd::rc_hevc_from_view(rc); + } string_f(vars, "encoder", video.encoder); string_f(vars, "adapter_name", video.adapter_name); diff --git a/sunshine/config.h b/sunshine/config.h index 80d4052d..624e4225 100644 --- a/sunshine/config.h +++ b/sunshine/config.h @@ -29,7 +29,8 @@ struct video_t { struct { std::optional quality; - std::optional rc; + std::optional rc_h264; + std::optional rc_hevc; int coder; } amd; diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 261d86bc..d52a5caa 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -459,19 +459,19 @@ static encoder_t amdvce { { "gops_per_idr"s, 30 }, { "usage"s, "ultralowlatency"s }, { "quality"s, &config::video.amd.quality }, - { "rc"s, &config::video.amd.rc }, + { "rc"s, &config::video.amd.rc_hevc }, }, - std::make_optional({ "qp"s, &config::video.qp }), + std::make_optional({ "qp_p"s, &config::video.qp }), "hevc_amf"s, }, { { { "usage"s, "ultralowlatency"s }, { "quality"s, &config::video.amd.quality }, - { "rc"s, &config::video.amd.rc }, + { "rc"s, &config::video.amd.rc_h264 }, { "log_to_dbg"s, "1"s }, }, - std::make_optional({ "qp"s, &config::video.qp }), + std::make_optional({ "qp_p"s, &config::video.qp }), "h264_amf"s, }, DEFAULT, From 1f7bdb1b2a19444b5f9394f237a191cb6cd4dc6f Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 19:35:06 +0200 Subject: [PATCH 4/5] Fix segfault when multiple controllers connected --- sunshine/platform/linux/input.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sunshine/platform/linux/input.cpp b/sunshine/platform/linux/input.cpp index a0d18578..fbdc8133 100644 --- a/sunshine/platform/linux/input.cpp +++ b/sunshine/platform/linux/input.cpp @@ -696,7 +696,13 @@ public: }; inline void rumbleIterate(std::vector &effects, std::vector &polls, std::chrono::milliseconds to) { - auto res = poll(&polls.data()->el, polls.size(), to.count()); + std::vector polls_tmp; + polls_tmp.reserve(polls.size()); + for(auto &poll : polls) { + polls_tmp.emplace_back(poll.el); + } + + auto res = poll(polls_tmp.data(), polls.size(), to.count()); // If timed out if(!res) { @@ -871,7 +877,7 @@ void broadcastRumble(safe::queue_t &rumble_queue_queue) { } if(polls.empty()) { - std::this_thread::sleep_for(50ms); + std::this_thread::sleep_for(250ms); } else { rumbleIterate(effects, polls, 100ms); From e7cbfb3ee92ae65064936f4938702e2c02b5beb0 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 19:54:32 +0200 Subject: [PATCH 5/5] Fix dependencies for debian bullseye --- gen-deb.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen-deb.in b/gen-deb.in index 70da08d2..3cb64ea0 100755 --- a/gen-deb.in +++ b/gen-deb.in @@ -38,7 +38,7 @@ Architecture: amd64 Maintainer: @loki Priority: optional Version: 0.11.0 -Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0, libboost-log1.67.0 | libboost-log1.71.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2 +Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0 | libboost-thread1.74.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0 | libboost-filesystem1.74.0, libboost-log1.67.0 | libboost-log1.71.0 | libboost-log1.74.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2 Description: Gamestream host for Moonlight EOF