From e36bde4acce48c21c79a57fb29727d96fdae6503 Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Sun, 2 Jul 2023 22:16:20 -0500 Subject: [PATCH] Implement AV1 codec support This has a breaking change to StreamConfiguration that requires client updates. --- src/Connection.c | 2 +- src/Limelight.h | 59 ++++++++++++--------- src/Misc.c | 3 +- src/RtspConnection.c | 28 +++++++--- src/SdpGenerator.c | 15 +++++- src/VideoDepacketizer.c | 114 ++++++++++++++++++++++++---------------- 6 files changed, 143 insertions(+), 78 deletions(-) diff --git a/src/Connection.c b/src/Connection.c index c7b9385..40b00c7 100644 --- a/src/Connection.c +++ b/src/Connection.c @@ -288,7 +288,7 @@ int LiStartConnection(PSERVER_INFORMATION serverInfo, PSTREAM_CONFIGURATION stre } // Dimensions over 4096 are only supported with HEVC on NVENC - if (!StreamConfig.supportsHevc && + if (!(StreamConfig.supportedVideoFormats & ~VIDEO_FORMAT_MASK_H264) && (StreamConfig.width > 4096 || StreamConfig.height > 4096)) { Limelog("WARNING: Streaming at resolutions above 4K using H.264 will likely fail! Trying anyway!\n"); } diff --git a/src/Limelight.h b/src/Limelight.h index eae150c..8b890b1 100644 --- a/src/Limelight.h +++ b/src/Limelight.h @@ -61,14 +61,14 @@ typedef struct _STREAM_CONFIGURATION { // See AUDIO_CONFIGURATION constants and MAKE_AUDIO_CONFIGURATION() below. int audioConfiguration; - // Specifies that the client can accept an H.265 video stream - // if the server is able to provide one. - bool supportsHevc; + // Specifies the mask of supported video formats. + // See VIDEO_FORMAT constants below. + int supportedVideoFormats; - // Specifies that the client is requesting an HDR H.265 video stream. + // Specifies that the client is requesting an HDR video stream. 
// // This should only be set if: - // 1) The client decoder supports HEVC Main10 profile (supportsHevc must be set too) + // 1) The client decoder supports a 10-bit format (as set in supportedVideoFormats) // 2) The server has support for HDR as indicated by ServerCodecModeSupport in /serverinfo // // See ConnListenerSetHdrMode() for a callback to indicate when to set @@ -81,6 +81,12 @@ typedef struct _STREAM_CONFIGURATION { // (or in addition to) improving image quality. int hevcBitratePercentageMultiplier; + // Specifies the percentage that the specified bitrate will be adjusted + // when an AV1 stream will be delivered. This allows clients to opt to + // reduce bandwidth when AV1 is chosen as the video codec rather than + // (or in addition to) improving image quality. + int av1BitratePercentageMultiplier; + // If specified, the client's display refresh rate x 100. For example, // 59.94 Hz would be specified as 5994. This is used by recent versions // of GFE for enhanced frame pacing. @@ -113,7 +119,8 @@ typedef struct _STREAM_CONFIGURATION { void LiInitializeStreamConfiguration(PSTREAM_CONFIGURATION streamConfig); // These identify codec configuration data in the buffer lists -// of frames identified as IDR frames. +// of frames identified as IDR frames for H.264 and HEVC formats. +// For other codecs, all data is marked as BUFFER_TYPE_PICDATA. #define BUFFER_TYPE_PICDATA 0x00 #define BUFFER_TYPE_SPS 0x01 #define BUFFER_TYPE_PPS 0x02 @@ -129,7 +136,7 @@ typedef struct _LENTRY { // Size of data in bytes (never <= 0) int length; - // Buffer type (listed above) + // Buffer type (listed above, only set for H.264 and HEVC formats) int bufferType; } LENTRY, *PLENTRY; @@ -137,10 +144,13 @@ typedef struct _LENTRY { // previous P-frames. #define FRAME_TYPE_PFRAME 0x00 -// Indicates this frame contains SPS, PPS, and VPS (if applicable) -// as the first buffers in the list. Each NALU will appear as a separate -// buffer in the buffer list. 
The I-frame data follows immediately +// This is a key frame. +// +// For H.264 and HEVC, this means the frame contains SPS, PPS, and VPS (HEVC only) NALUs +// as the first buffers in the list. The I-frame data follows immediately // after the codec configuration NALUs. +// +// For other codecs, any configuration data is not split into separate buffers. #define FRAME_TYPE_IDR 0x01 // A decode unit describes a buffer chain of video data from multiple packets @@ -219,22 +229,19 @@ typedef struct _DECODE_UNIT { // The maximum number of channels supported #define AUDIO_CONFIGURATION_MAX_CHANNEL_COUNT 8 -// Passed to DecoderRendererSetup to indicate that the following video stream will be -// in H.264 High Profile. -#define VIDEO_FORMAT_H264 0x0001 - -// Passed to DecoderRendererSetup to indicate that the following video stream will be -// in H.265 Main profile. This will only be passed if supportsHevc is true. -#define VIDEO_FORMAT_H265 0x0100 - -// Passed to DecoderRendererSetup to indicate that the following video stream will be -// in H.265 Main10 (HDR10) profile. This will only be passed if enableHdr is true. -#define VIDEO_FORMAT_H265_MAIN10 0x0200 +// Passed in StreamConfiguration.supportedVideoFormats to specify supported codecs +// and to DecoderRendererSetup() to specify selected codec. +#define VIDEO_FORMAT_H264 0x0001 // H.264 High Profile +#define VIDEO_FORMAT_H265 0x0100 // HEVC Main Profile +#define VIDEO_FORMAT_H265_MAIN10 0x0200 // HEVC Main10 Profile (requires enableHdr) +#define VIDEO_FORMAT_AV1_MAIN8 0x1000 // AV1 Main 8-bit profile +#define VIDEO_FORMAT_AV1_MAIN10 0x2000 // AV1 Main 10-bit profile (requires enableHdr) // Masks for clients to use to match video codecs without profile-specific details. 
-#define VIDEO_FORMAT_MASK_H264 0x00FF -#define VIDEO_FORMAT_MASK_H265 0xFF00 -#define VIDEO_FORMAT_MASK_10BIT 0x0200 +#define VIDEO_FORMAT_MASK_H264 0x000F +#define VIDEO_FORMAT_MASK_H265 0x0F00 +#define VIDEO_FORMAT_MASK_AV1 0xF000 +#define VIDEO_FORMAT_MASK_10BIT 0x2200 // If set in the renderer capabilities field, this flag will cause audio/video data to // be submitted directly from the receive thread. This should only be specified if the @@ -268,6 +275,10 @@ typedef struct _DECODE_UNIT { // also providing a sample callback is not allowed. #define CAPABILITY_PULL_RENDERER 0x20 +// If set in the video renderer capabilities field, this flag specifies that the renderer +// supports reference frame invalidation for AV1 streams. This flag is only valid on video renderers. +#define CAPABILITY_REFERENCE_FRAME_INVALIDATION_AV1 0x40 + // If set in the video renderer capabilities field, this macro specifies that the renderer // supports slicing to increase decoding performance. The parameter specifies the desired // number of slices per frame. This capability is only valid on video renderers. 
diff --git a/src/Misc.c b/src/Misc.c index 43c0f55..8c3c553 100644 --- a/src/Misc.c +++ b/src/Misc.c @@ -123,7 +123,8 @@ bool isReferenceFrameInvalidationSupportedByDecoder(void) { LC_ASSERT(NegotiatedVideoFormat != 0); return ((NegotiatedVideoFormat & VIDEO_FORMAT_MASK_H264) && (VideoCallbacks.capabilities & CAPABILITY_REFERENCE_FRAME_INVALIDATION_AVC)) || - ((NegotiatedVideoFormat & VIDEO_FORMAT_MASK_H265) && (VideoCallbacks.capabilities & CAPABILITY_REFERENCE_FRAME_INVALIDATION_HEVC)); + ((NegotiatedVideoFormat & VIDEO_FORMAT_MASK_H265) && (VideoCallbacks.capabilities & CAPABILITY_REFERENCE_FRAME_INVALIDATION_HEVC)) || + ((NegotiatedVideoFormat & VIDEO_FORMAT_MASK_AV1) && (VideoCallbacks.capabilities & CAPABILITY_REFERENCE_FRAME_INVALIDATION_AV1)); } bool isReferenceFrameInvalidationEnabled(void) { diff --git a/src/RtspConnection.c b/src/RtspConnection.c index 08e5483..b4f6baa 100644 --- a/src/RtspConnection.c +++ b/src/RtspConnection.c @@ -904,13 +904,27 @@ int performRtspHandshake(PSERVER_INFORMATION serverInfo) { goto Exit; } - // The RTSP DESCRIBE reply will contain a collection of SDP media attributes that - // describe the various supported video stream formats and include the SPS, PPS, - // and VPS (if applicable). We will use this information to determine whether the - // server can support HEVC. For some reason, they still set the MIME type of the HEVC - // format to H264, so we can't just look for the HEVC MIME type. What we'll do instead is - // look for the base 64 encoded VPS NALU prefix that is unique to the HEVC bitstream. 
- if (StreamConfig.supportsHevc && strstr(response.payload, "sprop-parameter-sets=AAAAAU")) { + if ((StreamConfig.supportedVideoFormats & VIDEO_FORMAT_MASK_AV1) && strstr(response.payload, "a=rtpmap:200 AV1/90000")) { + if (StreamConfig.enableHdr) { + NegotiatedVideoFormat = VIDEO_FORMAT_AV1_MAIN10; + } + else { + NegotiatedVideoFormat = VIDEO_FORMAT_AV1_MAIN8; + + // Apply bitrate adjustment for SDR AV1 if the client requested one + if (StreamConfig.av1BitratePercentageMultiplier != 0) { + StreamConfig.bitrate *= StreamConfig.av1BitratePercentageMultiplier; + StreamConfig.bitrate /= 100; + } + } + } + else if ((StreamConfig.supportedVideoFormats & VIDEO_FORMAT_MASK_H265) && strstr(response.payload, "sprop-parameter-sets=AAAAAU")) { + // The RTSP DESCRIBE reply will contain a collection of SDP media attributes that + // describe the various supported video stream formats and include the SPS, PPS, + // and VPS (if applicable). We will use this information to determine whether the + // server can support HEVC. For some reason, they still set the MIME type of the HEVC + // format to H264, so we can't just look for the HEVC MIME type. What we'll do instead is + // look for the base 64 encoded VPS NALU prefix that is unique to the HEVC bitstream. 
if (StreamConfig.enableHdr) { NegotiatedVideoFormat = VIDEO_FORMAT_H265_MAIN10; } diff --git a/src/SdpGenerator.c b/src/SdpGenerator.c index 5821581..ee32c75 100644 --- a/src/SdpGenerator.c +++ b/src/SdpGenerator.c @@ -347,7 +347,20 @@ static PSDP_OPTION getAttributesList(char*urlSafeAddr) { sprintf(payloadStr, "%d", slicesPerFrame); err |= addAttributeString(&optionHead, "x-nv-video[0].videoEncoderSlicesPerFrame", payloadStr); - if (NegotiatedVideoFormat & VIDEO_FORMAT_MASK_H265) { + if (NegotiatedVideoFormat & VIDEO_FORMAT_MASK_AV1) { + err |= addAttributeString(&optionHead, "x-nv-vqos[0].bitStreamFormat", "2"); + + if (AppVersionQuad[0] >= 7) { + // Enable HDR if requested + if (StreamConfig.enableHdr) { + err |= addAttributeString(&optionHead, "x-nv-video[0].dynamicRangeMode", "1"); + } + else { + err |= addAttributeString(&optionHead, "x-nv-video[0].dynamicRangeMode", "0"); + } + } + } + else if (NegotiatedVideoFormat & VIDEO_FORMAT_MASK_H265) { err |= addAttributeString(&optionHead, "x-nv-clientSupportHevc", "1"); err |= addAttributeString(&optionHead, "x-nv-vqos[0].bitStreamFormat", "1"); diff --git a/src/VideoDepacketizer.c b/src/VideoDepacketizer.c index 738019e..0fd6c23 100644 --- a/src/VideoDepacketizer.c +++ b/src/VideoDepacketizer.c @@ -14,6 +14,7 @@ static bool waitingForIdrFrame; static bool waitingForRefInvalFrame; static unsigned int lastPacketInStream; static bool decodingFrame; +static int frameType; static bool strictIdrFrameWait; static uint64_t syntheticPtsBase; static uint16_t frameHostProcessingLatency; @@ -148,6 +149,9 @@ void destroyVideoDepacketizer(void) { } static bool getAnnexBStartSequence(PBUFFER_DESC current, PBUFFER_DESC startSeq) { + // We must not get called for other codecs + LC_ASSERT(NegotiatedVideoFormat & (VIDEO_FORMAT_MASK_H264 | VIDEO_FORMAT_MASK_H265)); + if (current->length < 3) { return false; } @@ -207,6 +211,10 @@ void validateDecodeUnitForPlayback(PDECODE_UNIT decodeUnit) { // We get 2 sets of VPS, SPS, and PPS NALUs 
in HDR mode. // FIXME: Should we normalize this or something for clients? } + else if (NegotiatedVideoFormat & VIDEO_FORMAT_MASK_AV1) { + // We don't parse the AV1 bitstream + LC_ASSERT(decodeUnit->bufferList->bufferType == BUFFER_TYPE_PICDATA); + } else { LC_ASSERT(false); } @@ -450,6 +458,7 @@ static void reassembleFrame(int frameNumber) { if (qdu != NULL) { qdu->decodeUnit.bufferList = nalChainHead; qdu->decodeUnit.fullLength = nalChainDataLength; + qdu->decodeUnit.frameType = frameType; qdu->decodeUnit.frameNumber = frameNumber; qdu->decodeUnit.frameHostProcessingLatency = frameHostProcessingLatency; qdu->decodeUnit.receiveTimeMs = firstPacketReceiveTime; @@ -463,14 +472,10 @@ static void reassembleFrame(int frameNumber) { qdu->decodeUnit.hdrActive = LiGetCurrentHostDisplayHdrMode(); qdu->decodeUnit.colorspace = (uint8_t)(qdu->decodeUnit.hdrActive ? COLORSPACE_REC_2020 : StreamConfig.colorSpace); - // IDR frames will have leading CSD buffers - if (nalChainHead->bufferType != BUFFER_TYPE_PICDATA) { - qdu->decodeUnit.frameType = FRAME_TYPE_IDR; + // Invoke the key frame callback if needed + if (qdu->decodeUnit.frameType == FRAME_TYPE_IDR) { notifyKeyFrameReceived(); } - else { - qdu->decodeUnit.frameType = FRAME_TYPE_PFRAME; - } nalChainHead = nalChainTail = NULL; nalChainDataLength = 0; @@ -520,6 +525,11 @@ static int getBufferFlags(char* data, int length) { BUFFER_DESC buffer; BUFFER_DESC candidate; + // We only parse H.264 and HEVC bitstreams + if (!(NegotiatedVideoFormat & (VIDEO_FORMAT_MASK_H264 | VIDEO_FORMAT_MASK_H265))) { + return BUFFER_TYPE_PICDATA; + } + buffer.data = data; buffer.length = (unsigned int)length; buffer.offset = 0; @@ -612,7 +622,7 @@ static void queueFragment(PLENTRY_INTERNAL* existingEntry, char* data, int offse } // Process an RTP Payload using the slow path that handles multiple NALUs per packet -static void processRtpPayloadSlow(PBUFFER_DESC currentPos, PLENTRY_INTERNAL* existingEntry) { +static void 
processAvcHevcRtpPayloadSlow(PBUFFER_DESC currentPos, PLENTRY_INTERNAL* existingEntry) { // We should not have any NALUs when processing the first packet in an IDR frame LC_ASSERT(nalChainHead == NULL); LC_ASSERT(nalChainTail == NULL); @@ -637,9 +647,6 @@ static void processRtpPayloadSlow(PBUFFER_DESC currentPos, PLENTRY_INTERNAL* exi start++; #endif - // Now we're decoding a frame - decodingFrame = true; - if (isSeqReferenceFrameStart(currentPos)) { // No longer waiting for an IDR frame waitingForIdrFrame = false; @@ -651,6 +658,9 @@ static void processRtpPayloadSlow(PBUFFER_DESC currentPos, PLENTRY_INTERNAL* exi // Use the cached LENTRY for this NALU since it will be // the bulk of the data in this packet. containsPicData = true; + + // This is an IDR frame + frameType = FRAME_TYPE_IDR; } // Move to the next NALU @@ -784,6 +794,7 @@ static void processRtpPayload(PNV_VIDEO_PACKET videoPacket, int length, // We're now decoding a frame decodingFrame = true; + frameType = FRAME_TYPE_PFRAME; firstPacketReceiveTime = receiveTimeMs; // Some versions of Sunshine don't send a valid PTS, so we will @@ -810,6 +821,13 @@ static void processRtpPayload(PNV_VIDEO_PACKET videoPacket, int length, case 1: // Normal P-frame break; case 2: // IDR frame + // For other codecs, we trust the frame header rather than parsing the bitstream + // to determine if a given frame is an IDR frame. + if (!(NegotiatedVideoFormat & (VIDEO_FORMAT_MASK_H264 | VIDEO_FORMAT_MASK_H265))) { + waitingForIdrFrame = false; + frameType = FRAME_TYPE_IDR; + } + // Fall-through case 4: // Intra-refresh case 5: // P-frame with reference frames invalidated if (waitingForRefInvalFrame) { @@ -905,49 +923,57 @@ static void processRtpPayload(PNV_VIDEO_PACKET videoPacket, int length, // Other versions don't have a frame header at all } - // The Annex B NALU start prefix must be next - if (!getAnnexBStartSequence(&currentPos, NULL)) { - // If we aren't starting on a start prefix, something went wrong. 
- LC_ASSERT(false); + // We only parse H.264 and HEVC at the NALU level + if (NegotiatedVideoFormat & (VIDEO_FORMAT_MASK_H264 | VIDEO_FORMAT_MASK_H265)) { + // The Annex B NALU start prefix must be next + if (!getAnnexBStartSequence(&currentPos, NULL)) { + // If we aren't starting on a start prefix, something went wrong. + LC_ASSERT(false); - // For release builds, we will try to recover by searching for one. - // This mimics the way most decoders handle this situation. - skipToNextNal(&currentPos); - } + // For release builds, we will try to recover by searching for one. + // This mimics the way most decoders handle this situation. + skipToNextNal(&currentPos); + } - // If an AUD NAL is prepended to this frame data, remove it. - // Other parts of this code are not prepared to deal with a - // NAL of that type, so stripping it is the easiest option. - if (isAccessUnitDelimiter(&currentPos)) { - skipToNextNal(&currentPos); - } + // If an AUD NAL is prepended to this frame data, remove it. + // Other parts of this code are not prepared to deal with a + // NAL of that type, so stripping it is the easiest option. + if (isAccessUnitDelimiter(&currentPos)) { + skipToNextNal(&currentPos); + } - // There may be one or more SEI NAL units prepended to the - // frame data *after* the (optional) AUD. - while (isSeiNal(&currentPos)) { - skipToNextNal(&currentPos); + // There may be one or more SEI NAL units prepended to the + // frame data *after* the (optional) AUD. + while (isSeiNal(&currentPos)) { + skipToNextNal(&currentPos); + } } } - if (firstPacket && isIdrFrameStart(&currentPos)) - { - // SPS and PPS prefix is padded between NALs, so we must decode it with the slow path - processRtpPayloadSlow(&currentPos, existingEntry); - } - else - { - // Intel's H.264 Media Foundation encoder prepends a PPS to each P-frame. - // Skip it to avoid confusing clients. 
- if (firstPacket && isPictureParameterSetNal(&currentPos)) { - skipToNextNal(&currentPos); + if (NegotiatedVideoFormat & (VIDEO_FORMAT_MASK_H264 | VIDEO_FORMAT_MASK_H265)) { + if (firstPacket && isIdrFrameStart(&currentPos)) { + // SPS and PPS prefix is padded between NALs, so we must decode it with the slow path + processAvcHevcRtpPayloadSlow(&currentPos, existingEntry); } + else { + // Intel's H.264 Media Foundation encoder prepends a PPS to each P-frame. + // Skip it to avoid confusing clients. + if (firstPacket && isPictureParameterSetNal(&currentPos)) { + skipToNextNal(&currentPos); + } #ifdef FORCE_3_BYTE_START_SEQUENCES - if (firstPacket) { - currentPos.offset++; - currentPos.length--; - } + if (firstPacket) { + currentPos.offset++; + currentPos.length--; + } #endif + + queueFragment(existingEntry, currentPos.data, currentPos.offset, currentPos.length); + } + } + else { + // Other codecs are just passed through as is. queueFragment(existingEntry, currentPos.data, currentPos.offset, currentPos.length); } @@ -991,7 +1017,7 @@ static void processRtpPayload(PNV_VIDEO_PACKET videoPacket, int length, // depacketizer will next try to process a non-SOF packet, // and cause it to assert. if (dropStatePending) { - if (nalChainHead && nalChainHead->bufferType != BUFFER_TYPE_PICDATA) { + if (nalChainHead && frameType == FRAME_TYPE_IDR) { // Don't drop the frame state if this frame is an IDR frame itself, // otherwise we'll lose this IDR frame without another in flight // and have to wait until we hit our consecutive drop limit to