label-studio.git

# Template metadata for the "Evaluate Production Conversations for RLHF" labeling template.
# NOTE(review): presumably consumed by the template gallery loader — confirm which keys are required.
title: Evaluate Production Conversations for RLHF
# NOTE(review): "enterprise" presumably restricts the template to the Enterprise edition — confirm.
type: enterprise
# Gallery section and position inside it (assumed from the key names — confirm against the loader).
group: Chat
order: 5
# Path to the preview thumbnail for this template.
image: /static/templates/chat-rlhf.png
# HTML description: one heading followed by a definition list with three entries
# (Industry Applications, Associated Models, Domain Terminology).
details: |
  <h1>Bring production chats into this template to learn why your agent succeeds or fails</h1>
  <dl>
    <dt>Industry Applications</dt>
    <dd>RLHF data collection, production conversation analysis, agent improvement, user preference learning, failure analysis, success pattern identification, fine-tuning data generation, quality monitoring, real-world behavior evaluation, feedback collection</dd>
    <dt>Associated Models</dt>
    <dd>RLHF training, preference learning, conversation evaluation, quality metrics</dd>
    <dt>Domain Terminology</dt>
    <dd>RLHF, human feedback, preference labels, conversation-level evaluation, per-message evaluation, production conversations, agent performance, success patterns</dd>
  </dl>
config: |
  <View>
    <Style>
      .chat {
        border: 1px solid var(--color-neutral-border);
        padding: var(--spacing-tight);
        border-radius: var(--corner-radius-medium);
        background-color: var(--color-neutral-background);
      }
      .evaluation {
          border: 2px solid var(--color-accent-canteloupe-base);
          background-color: var(--color-accent-canteloupe-subtlest);
          color: var(--color-accent-canteloupe-bold);
          padding: var(--spacing-tight);
          border-radius: var(--corner-radius-medium);
          margin-bottom: var(--spacing-base);
      }
      <!-- Choice text -->
      .evaluation span {
          color: var(--color-accent-canteloupe-bold);
      }
      <!-- Star rating -->
      .evaluation .ant-rate-star.ant-rate-star-full span {
        color: var(--color-accent-canteloupe-base);
       }
     
      .overall-chat {
         border-bottom: 1px solid var(--color-neutral-border);
         margin-bottom: var(--spacing-base);
      }
      .instructions {
         color: var(--color-accent-canteloupe-bold);
         background-color: var(--color-accent-canteloupe-subtlest);
         padding: var(--spacing-tight);
         border-radius: var(--corner-radius-medium);
         border: 1px solid var(--color-accent-canteloupe-subtle);
      }
      <!-- Allow enlarging the instruction text -->
      .lsf-richtext__container.lsf-htx-richtext {
        font-size: 16px !important;
        line-height: 1.6;
      }
      
      <!-- Remove excess height from the chat to allow space for instruction text -->
      .htx-chat { 
        --excess-height: 275px;
        background-color: var(--color-neutral-background);
      }
    </Style>
    <View style="display: flex; gap: var(--spacing-wide);">
      
      <!-- Left: conversation -->
      <View className="chat" style="flex: 2;">
        <View className="instructions">
          <Text name="instructions" value="Review the conversation in detail. 
                                           As you read through it, click on individual messages to 
                                           provide feedback about accuracy, clarity, and intent." />
        </View>
        
        <Chat name="chat" value="$chat" 
              minMessages="2" 
              editable="false" />
      </View>
 
      <!-- Right: conversation and message evaluation -->
      <View style="flex: 1; display: flex; flex-direction: column;" className="evaluation">
        
        <!-- Evaluate the whole conversation -->
        <View className="overall-chat">
          <Header size="4">Overall quality of this conversation</Header>
          <Rating name="rating" toName="chat" />
          
          <View style="padding-top: var(--spacing-base);">
            <Text name="add_comment" value="Add a comment (optional)" />
            <TextArea name="conversation_comment" toName="chat" />
          </View>
        </View>
 
        <!-- Per-message evaluation -->
        <View style="flex: 1; overflow: auto;">
        
          <!-- Only visible when no message is selected -->
          <View visibleWhen="no-region-selected">
            <Header value="Click on a message to evaluate" size="3"/>
          </View>
 
          <!-- Only visible when an assistant message is selected -->
          <View visibleWhen="region-selected" whenRole="assistant">
            <Header value="Response Accuracy"/>
            <Rating name="assistant_response_accuracy" toName="chat" perRegion="true" maxRating="5" icon="star"/>
            
            <Header value="Relevance"/>
            <Choices name="relevance" toName="chat" perRegion="true" >
              <Choice value="Highly relevant" />
              <Choice value="Somewhat relevant" />
              <Choice value="Not relevant" />
            </Choices>
            
            <Header value="Correctness"/>
            <Choices name="correctness" toName="chat" perRegion="true" >
              <Choice value="Accurate and factual" />
              <Choice value="Mostly accurate with minor issues" />
              <Choice value="Contains significant errors" />
            </Choices>
            
            <Header value="Tone"/>
            <Choices name="tone" toName="chat" perRegion="true" choice="multiple" >
              <Choice value="Too formal" />
              <Choice value="Too casual" />
              <Choice value="Appropriate" />
            </Choices>
            
            <Header value="Add a comment (optional)"/>
            <TextArea perRegion="true" name="message_comment" toName="chat" />
          </View>
        </View>
      </View>
    </View>
  </View>
 
  <!-- {
    "data": {
      "text": "Review the conversation in detail. As you read through it, click on individual messages to provide feedback about accuracy, clarity, and intent.",
      "chat": [
        {
          "role": "user",
          "content": "I'm onboarding a new annotator. What are the first steps to ensure quality and speed?"
        },
        {
          "role": "assistant",
          "content": "Start by creating their account and assigning the Annotator role so permissions are scoped correctly. Add concise labeling instructions that appear before labeling. Use a small ground-truth set and require overlap (e.g., 2 annotators per task) with a reviewer to resolve disagreements. Enable comments so reviewers can leave precise, in-task feedback."
        }
      ]
    }
  } -->