sglang_v0.5.2/sglang/sgl-router/tests/tool_parser_mixed_edge_case...

//! Mixed Format and Additional Edge Case Tests
//!
//! Tests for edge cases across parsers and mixed format scenarios

use serde_json::json;
use sglang_router_rs::tool_parser::{
    JsonParser, LlamaParser, MistralParser, ParseState, PythonicParser, QwenParser, StreamResult,
    ToolParser,
};

#[tokio::test]
async fn test_mixed_formats_in_text() {
    // Test that parsers correctly ignore other formats' markers
    let json_parser = JsonParser::new();
    let input = r#"
Some text with [TOOL_CALLS] marker that shouldn't trigger.
Also has <tool_call> tags and [function()] syntax.
But here's the actual JSON: {"name": "test", "arguments": {}}
"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "test");

    // Mistral parser should ignore JSON and other formats
    let mistral_parser = MistralParser::new();
    let input = r#"
{"name": "fake"} [function()] <tool_call>
[TOOL_CALLS] [{"name": "real", "arguments": {}}]
"#;
    let result = mistral_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "real");
}

#[tokio::test]
async fn test_format_markers_in_string_content() {
    // Test that format markers inside string content don't interfere
    let pythonic_parser = PythonicParser::new();
    let input = r#"[echo(text="Use [TOOL_CALLS] and <tool_call> in text")]"#;
    let result = pythonic_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert_eq!(args["text"], "Use [TOOL_CALLS] and <tool_call> in text");

    let qwen_parser = QwenParser::new();
    let input = r#"<tool_call>
{"name": "log", "arguments": {"msg": "Found [function()] pattern"}}
</tool_call>"#;
    let result = qwen_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert_eq!(args["msg"], "Found [function()] pattern");
}

#[tokio::test]
async fn test_deeply_nested_json_structures() {
    let json_parser = JsonParser::new();
    let input = r#"{
        "name": "deep_process",
        "arguments": {
            "level1": {
                "level2": {
                    "level3": {
                        "level4": {
                            "level5": {
                                "data": [1, 2, [3, [4, 5]]]
                            }
                        }
                    }
                }
            }
        }
    }"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "deep_process");
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert!(args["level1"]["level2"]["level3"]["level4"]["level5"]["data"].is_array());
}

#[tokio::test]
async fn test_multiple_sequential_calls_different_formats() {
    // Simulate a scenario where different parts of text have different formats
    // (though each parser will only recognize its own format)
    let llama_parser = LlamaParser::new();

    // Llama parser currently only returns the first tool found
    let input = r#"First call: <|python_tag|>{"name": "call1", "arguments": {}}"#;
    let result = llama_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "call1");

    // Test plain JSON separately
    let input2 = r#"{"name": "call2", "arguments": {"x": 1}}"#;
    let result2 = llama_parser.parse_complete(input2).await.unwrap();
    assert_eq!(result2.len(), 1);
    assert_eq!(result2[0].function.name, "call2");
}

#[tokio::test]
async fn test_empty_and_whitespace_variations() {
    let json_parser = JsonParser::new();
    // Various whitespace scenarios
    let cases = vec![
        r#" {"name":"compact","arguments":{}} "#,
        r#"
{"name": "spaced", "arguments": {}}
"#,
        r#" {"name": "tabbed", "arguments": {}} "#, // tabs
    ];
    for input in cases {
        let result = json_parser.parse_complete(input).await.unwrap();
        assert_eq!(result.len(), 1, "Should parse regardless of whitespace");
    }
}

#[tokio::test]
async fn test_special_json_values() {
    let json_parser = JsonParser::new();
    // Test various special JSON values
    let input = r#"{
        "name": "test_special",
        "arguments": {
            "float_e": 1.23e10,
            "float_neg_e": 1.23e-10,
            "hex_like": "0x1234",
            "very_long_num": 99999999999999999999,
            "special_strings": ["", " ", "\u0000", "\u001f"],
            "escaped": "\\n\\r\\t\\\"\\\\",
            "unicode": "\u4e2d\u6587"
        }
    }"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "test_special");
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert!(args["special_strings"].is_array());
    assert!(args["escaped"].is_string());
}

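// Added sketch (not in the original suite): the "very_long_num" literal above does not fit in a
// u64, so with serde_json's default features (no `arbitrary_precision`) it is expected to come
// back as an f64 rather than an integer. This only exercises serde_json itself, not the parsers.
#[tokio::test]
async fn test_serde_json_large_integer_fallback_sketch() {
    let value: serde_json::Value = serde_json::from_str("99999999999999999999").unwrap();
    // Out-of-range integers are represented as floating point by default.
    assert!(value.is_f64());
}
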
#[tokio::test]
async fn test_parser_recovery_after_invalid_input() {
    let mut state = ParseState::new();
    let parser = JsonParser::new();

    // Send invalid JSON first
    let _ = parser.parse_incremental(r#"{"broken": "#, &mut state).await;

    // Clear state and try valid JSON
    state.buffer.clear();
    let result = parser
        .parse_incremental(r#"{"name": "valid", "arguments": {}}"#, &mut state)
        .await
        .unwrap();
    match result {
        StreamResult::ToolComplete(tool) => {
            assert_eq!(tool.function.name, "valid");
        }
        _ => {
            // Might be incomplete depending on implementation
        }
    }
}

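// Added sketch (not part of the original file): drive parse_incremental with the same valid call
// split across two chunks. No particular StreamResult variant is asserted per step, since the
// streaming behavior is implementation-defined; the name is only checked if a tool completes.
#[tokio::test]
async fn test_incremental_chunked_input_sketch() {
    let parser = JsonParser::new();
    let mut state = ParseState::new();
    let chunks = [r#"{"name": "chunked", "argu"#, r#"ments": {"x": 1}}"#];
    for chunk in chunks {
        if let Ok(StreamResult::ToolComplete(tool)) =
            parser.parse_incremental(chunk, &mut state).await
        {
            assert_eq!(tool.function.name, "chunked");
        }
    }
}
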
#[tokio::test]
async fn test_boundary_cases_for_extraction() {
    // Test edge cases in JSON extraction from text
    let json_parser = JsonParser::new();

    // JSON at the very beginning
    let input = r#"{"name": "start", "arguments": {}} and then text"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "start");

    // JSON at the very end
    let input = r#"Some text first {"name": "end", "arguments": {}}"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "end");

    // Multiple JSON objects in text (should find first valid one)
    let input =
        r#"Text {"name": "first", "arguments": {}} more {"name": "second", "arguments": {}}"#;
    let result = json_parser.parse_complete(input).await.unwrap();
    assert!(!result.is_empty());
    assert_eq!(result[0].function.name, "first");
}

#[tokio::test]
async fn test_pythonic_edge_cases() {
    let parser = PythonicParser::new();

    // Function name with underscores and numbers
    let input = r#"[func_name_2(param_1="value")]"#;
    let result = parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "func_name_2");

    // Empty string argument
    let input = r#"[process(text="")]"#;
    let result = parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert_eq!(args["text"], "");
}

#[tokio::test]
async fn test_mistral_with_pretty_json() {
    let parser = MistralParser::new();
    // Pretty-printed JSON in Mistral format
    let input = r#"[TOOL_CALLS] [
        {
            "name": "formatted",
            "arguments": {
                "nested": {
                    "key": "value"
                },
                "array": [
                    1,
                    2,
                    3
                ]
            }
        }
    ]"#;
    let result = parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "formatted");
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert_eq!(args["nested"]["key"], "value");
    assert_eq!(args["array"], json!([1, 2, 3]));
}

#[tokio::test]
async fn test_qwen_with_cdata_like_content() {
    let parser = QwenParser::new();
    // Test with content that looks like CDATA but isn't
    // Note: QwenParser expects exactly "<tool_call>\n" with the newline
    let input = r#"<tool_call>
{"name": "process", "arguments": {"xml": "<![CDATA[some data]]>"}}
</tool_call>"#;
    let result = parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "process");
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    assert_eq!(args["xml"], "<![CDATA[some data]]>");
}

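// Added sketch (not in the original file): the same well-formed Qwen block as above, but built
// with format! so the required "<tool_call>\n" prefix and "\n</tool_call>" suffix are explicit
// rather than implied by raw-string layout. The tool name "ping" is illustrative only.
#[tokio::test]
async fn test_qwen_explicit_newline_construction_sketch() {
    let parser = QwenParser::new();
    let body = r#"{"name": "ping", "arguments": {}}"#;
    let input = format!("<tool_call>\n{}\n</tool_call>", body);
    let result = parser.parse_complete(&input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, "ping");
}
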
#[tokio::test]
async fn test_extremely_long_function_names() {
    let parser = PythonicParser::new();
    let long_name = "very_long_function_name_that_might_appear_in_generated_code_somewhere";
    let input = format!(r#"[{}(param="value")]"#, long_name);
    let result = parser.parse_complete(&input).await.unwrap();
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].function.name, long_name);
}

#[tokio::test]
async fn test_json_with_duplicate_keys() {
    let parser = JsonParser::new();
    // JSON with duplicate keys (the spec leaves this undefined; serde_json keeps the last value)
    let input = r#"{"name": "test", "arguments": {"key": "first", "key": "second"}}"#;
    let result = parser.parse_complete(input).await.unwrap();
    assert_eq!(result.len(), 1);
    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
    // JSON parsers typically keep the last value for duplicate keys
    assert_eq!(args["key"], "second");
}
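
// Added sketch (not in the original file): the duplicate-key expectation above ultimately comes
// from serde_json itself, which keeps the last value when a key repeats, independent of any tool
// parser. RFC 8259 leaves duplicate-key handling to the implementation.
#[tokio::test]
async fn test_serde_json_duplicate_key_behavior_sketch() {
    let value: serde_json::Value =
        serde_json::from_str(r#"{"key": "first", "key": "second"}"#).unwrap();
    assert_eq!(value["key"], "second");
}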