Spaces:

merterbak
/

gpt-oss-20b-demo

Running on Zero

App Files Files Community

merterbak committed on Sep 10

Commit

ae0ab06

verified ·

1 Parent(s): 0c86c79

Harmony attempt #1 blended with simple formatting

Browse files

Files changed (1) hide show

app.py +51 -7

app.py CHANGED Viewed

@@ -4,6 +4,13 @@ from threading import Thread
 import gradio as gr
 import spaces
 import re
 model_id = "openai/gpt-oss-20b"
@@ -12,7 +19,9 @@ pipe = pipeline(
     model=model_id,
     torch_dtype="auto",
     device_map="auto",
 )
 def format_conversation_history(chat_history):
     messages = []
     for item in chat_history:
@@ -22,6 +31,34 @@ def format_conversation_history(chat_history):
             content = content[0]["text"] if content and "text" in content[0] else str(content)
         messages.append({"role": role, "content": content})
     return messages
 @spaces.GPU()
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
@@ -29,7 +66,12 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
     system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
     processed_history = format_conversation_history(chat_history)
     messages = system_message + processed_history + [new_message]
     streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
@@ -37,18 +79,19 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
-        "streamer": streamer
     }
-    thread = Thread(target=pipe, args=(messages,), kwargs=generation_kwargs)
     thread.start()
-    # simple formatting without harmony because of no tool usage etc. and experienced hf space problems with harmony
     thinking = ""
     final = ""
     started_final = False
     for chunk in streamer:
         if not started_final:
             if "assistantfinal" in chunk.lower():
-                split_parts = re.split(r'assistantfinal', chunk, maxsplit=1)
                 thinking += split_parts[0]
                 final += split_parts[1]
                 started_final = True
@@ -56,7 +99,7 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
                 thinking += chunk
         else:
             final += chunk
-        clean_thinking = re.sub(r'^analysis\s*', '', thinking).strip()
         clean_final = final.strip()
         formatted = f"<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
         yield formatted
@@ -78,8 +121,9 @@ demo = gr.ChatInterface(
     ],
     examples=[
         [{"text": "Explain Newton laws clearly and concisely"}],
-        [{"text": "Write a Python function to calculate the Fibonacci sequence"}],
         [{"text": "What are the benefits of open weight AI models"}],
     ],
     cache_examples=False,
     type="messages",
@@ -96,4 +140,4 @@ Give it a couple of seconds to start. You can adjust reasoning level in the syst
 )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
 import spaces
 import re
+from openai_harmony import (
+    load_harmony_encoding,
+    HarmonyEncodingName,
+    Role,
+    Message,
+    Conversation,
+)
 model_id = "openai/gpt-oss-20b"
     model=model_id,
     torch_dtype="auto",
     device_map="auto",
+    trust_remote_code=True,
 )
+enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 def format_conversation_history(chat_history):
     messages = []
     for item in chat_history:
             content = content[0]["text"] if content and "text" in content[0] else str(content)
         messages.append({"role": role, "content": content})
     return messages
+#OpenAI's harmony format
+def build_harmony_conversation_from_messages(messages):
+    harmony_messages = []
+    for m in messages:
+        role = m["role"].lower()
+        content = m["content"]
+        if role == "system":
+            harmony_messages.append(
+                Message.from_role_and_content(
+                    Role.SYSTEM,
+                    content,
+                )
+            )
+        elif role == "user":
+            harmony_messages.append(
+                Message.from_role_and_content(
+                    Role.USER,
+                    content,
+                )
+            )
+        elif role == "assistant":
+            harmony_messages.append(
+                Message.from_role_and_content(
+                    Role.ASSISTANT,
+                    content,
+                )
+            )
+    return Conversation.from_messages(harmony_messages)
 @spaces.GPU()
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
     system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
     processed_history = format_conversation_history(chat_history)
     messages = system_message + processed_history + [new_message]
+    conversation = build_harmony_conversation_from_messages(messages)
+    prompt_tokens = enc.render_conversation_for_completion(conversation, Role.ASSISTANT)
+    prompt_text = pipe.tokenizer.decode(prompt_tokens, skip_special_tokens=False)
     streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
+        "streamer": streamer,
+        "return_full_text": False,
     }
+    thread = Thread(target=pipe, args=(prompt_text,), kwargs=generation_kwargs)
     thread.start()
     thinking = ""
     final = ""
     started_final = False
     for chunk in streamer:
         if not started_final:
             if "assistantfinal" in chunk.lower():
+                split_parts = re.split(r'(?i)assistantfinal', chunk, maxsplit=1)
                 thinking += split_parts[0]
                 final += split_parts[1]
                 started_final = True
                 thinking += chunk
         else:
             final += chunk
+        clean_thinking = re.sub(r'^analysis\s*', '', thinking, flags=re.I).strip()
         clean_final = final.strip()
         formatted = f"<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
         yield formatted
     ],
     examples=[
         [{"text": "Explain Newton laws clearly and concisely"}],
         [{"text": "What are the benefits of open weight AI models"}],
+        [{"text": "Write a Python function to calculate the Fibonacci sequence"}],
     ],
     cache_examples=False,
     type="messages",
 )
 if __name__ == "__main__":
+    demo.launch()