from urllib.request import urlopen

from openai import OpenAI

# Test prompts of increasing length from the Qwen2.5-1M test data.
test_cases = {
    "64k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt",
    "200k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt",
    "600k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt",
    "1m": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt",
}

# Point the OpenAI-compatible client at the locally served endpoint.
client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")

for name, url in test_cases.items():
    print(f"\n==== Running test case: {name} ====")

    # Download the prompt text; skip this test case if the fetch fails.
    try:
        with urlopen(url, timeout=10) as response:
            prompt = response.read().decode("utf-8")
    except Exception as e:
        print(f"Failed to load prompt for {name}: {e}")
        continue

    # Request a short completion and print streamed tokens as they arrive.
    try:
        stream = client.chat.completions.create(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0,
        )
        for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content is not None:
                print(chunk.choices[0].delta.content, end="", flush=True)
    except Exception as e:
        print(f"\nError during completion for {name}: {e}")