35 lines
1.5 KiB
Python
35 lines
1.5 KiB
Python
from dataclasses import dataclass, field
from typing import Dict, List, Optional
|
|
|
|
|
|
@dataclass
class Arguments:
    # fmt: off
    """Store the arguments for the model configuration and data processing.

    The model-related fields listed below are presumably the keys of each
    dict in ``models`` (confirm against the code that consumes them).

    For CLIP model support, you can use the following fields:
        model_name: str
        revision: str = "master"
        hub: str = "modelscope"

    For API VLM model support (image caption only), you can use the
    following fields:
        model_name: str, e.g. "gpt-4o-mini"
        api_base: str = ""
        api_key: Optional[str] = None
        prompt: Optional[str] = None
    """
    # NOTE: the default must be an empty *list* to match the annotation; an
    # empty dict default would break list operations such as ``append``.
    models: List[Dict] = field(default_factory=list)        # List of paths to the pre-trained models or model identifiers
    dataset_name: List[str] = field(default_factory=list)   # List of dataset names to be used
    data_dir: Optional[str] = None      # Root directory where the datasets are stored
    split: str = 'test'                 # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
    task: Optional[str] = None          # Task selector; accepted values are not visible here -- TODO confirm
    batch_size: int = 128               # Batch size for data loading
    num_workers: int = 1                # Number of workers for data loading
    verbose: bool = True                # Flag to enable verbose logging
    output_dir: str = 'outputs'         # Directory where the outputs (e.g., predictions, logs) will be saved
    cache_dir: str = 'cache'            # Directory where the dataset cache will be stored
    skip_existing: bool = False         # Flag to skip processing if outputs already exist
    limit: Optional[int] = None         # Limit the number of samples to be processed