# .env

# Configuring via the WebUI is recommended; the .env file can be exported once configuration is done.

# Sample and explanation can be found in .env.example

# Logging switches.
# NOTE(review): the GENERAL/OPENAI suffixes presumably match the "type" values
# ("general" / "openai") used in FORWARD_CONFIG — confirm in .env.example.
LOG_GENERAL=true
LOG_OPENAI=true

# Cache switches, split by the same GENERAL/OPENAI suffixes.
CACHE_GENERAL=true
CACHE_OPENAI=true

# Route path of the chat-completion endpoint.
CHAT_COMPLETION_ROUTE=/v1/chat/completions
#CUSTOM_GENERAL_ROUTE=/v1/models/gemini-pro
# `CACHE_ROUTES`: JSON array of route paths.
# NOTE(review): presumably the routes whose responses are cached — confirm in .env.example.
CACHE_ROUTES=["/v1/chat/completions","/v1/embeddings"]

# `CACHE_BACKEND`: Options (MEMORY, LMDB, LevelDB)
CACHE_BACKEND=MEMORY
# NOTE(review): presumably the storage location used by the non-MEMORY backends — confirm.
#CACHE_ROOT_PATH_OR_URL=./FLAXKV_DB

# NOTE(review): presumably the caching behavior applied when a request does not
# specify one itself — confirm in .env.example.
DEFAULT_REQUEST_CACHING_VALUE=false

# NOTE(review): benchmark mode is enabled here, and TOKEN_RATE_LIMIT in this file
# also defines a "/benchmark/v1/chat/completions" route. Confirm this is intended
# outside of testing.
BENCHMARK_MODE=true

# `FORWARD_CONFIG`: JSON array of forwarding targets; each entry carries
# "base_url", "route" and "type". "type" values used in the examples here: "openai", "general".
# Only one FORWARD_CONFIG line should be active; the others are kept as alternatives.
FORWARD_CONFIG=[{"base_url":"https://api.openai.com","route":"/","type":"openai"}]
#FORWARD_CONFIG=[{"base_url":"https://api.deepseek.com","route":"/","type":"openai"}]
#FORWARD_CONFIG=[{"base_url":"http://localhost:3000","route":"/","type":"general"}]
# Example (disabled): custom model backend given as a single-quoted multi-line JSON object.
#CUSTOM_MODEL_CONFIG='{
#"backend":"ollama",
#"model_map": {"gpt-3.5-turbo":"qwen2:7b"},
#"api_base": "http://localhost:11434"
#}'

# Examples (disabled): level-based key/model configuration.
# NOTE(review): the integer levels appear to tie API keys, forward keys and model
# lists together — confirm in .env.example.
# NOTE: the "sk-xxx" entries are placeholders; JSON object keys must be unique,
# so use a distinct key for each entry when enabling OPENAI_API_KEY_CONFIG.
#LEVEL_MODELS='{"1":["gpt-4"],"2":["gpt-3.5-turbo"]}'
#OPENAI_API_KEY_CONFIG='{"sk-xxx": [0], "sk-xxx": [1], "sk-xxx": [1,2]}'
#FORWARD_KEY_CONFIG='{"0": ["fk-0"], "1":["fk-1", "fk-11"], "2": ["fk-2"]}'

# `REQ_RATE_LIMIT`: request rate limit for the specified routes (user specific).
# Format: {route: [{"level": <int>, "limit": "<ratelimit-string>"}, ...]}
# ratelimit-string format: [count] [per|/] [n (optional)] [second|minute|hour|day|month|year]
# Several limits can be combined in one string with ";" (e.g. "60/minute;600/hour").
# Reference: https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
REQ_RATE_LIMIT='{
"/v1/chat/completions":[{"level":0,"limit":"100/2minutes"}],
"/v1/completions":[{"level":0,"limit":"60/minute;600/hour"}]
}'

# Backend for rate limiting: [memory, redis, memcached, ...]
# Reference: https://limits.readthedocs.io/en/stable/storage.html
#REQ_RATE_LIMIT_BACKEND=redis://localhost:6379

# `GLOBAL_RATE_LIMIT`: limits all routes not specified by `REQ_RATE_LIMIT`. If not set, there is no limit by default.
GLOBAL_RATE_LIMIT=200/minute

# `RATE_LIMIT_STRATEGY`: Options (fixed-window, fixed-window-elastic-expiry, moving-window)
# Reference: https://limits.readthedocs.io/en/latest/strategies.html
# `fixed-window`: the most memory-efficient strategy; `moving-window`: the most effective at preventing bursts, but with a higher memory cost.
RATE_LIMIT_STRATEGY=moving-window

# `TOKEN_RATE_LIMIT`: rate limit for returned tokens, per route.
# Uses the same JSON structure as `REQ_RATE_LIMIT`:
# {route: [{"level": <int>, "limit": "<ratelimit-string>"}, ...]}
TOKEN_RATE_LIMIT='{
"/v1/chat/completions":[{"level":0,"limit":"100/second"}],
"/v1/completions":[{"level":0,"limit":"60/second"}],
"/benchmark/v1/chat/completions":[{"level":0,"limit":"20/second"}]
}'

# TCP connection timeout duration (in seconds)
TIMEOUT=6

# `ITER_CHUNK_TYPE`: values used in this file are `one-by-one` (active) and `efficiency` (alternative below).
# NOTE(review): presumably controls how streamed chunks are iterated — confirm in .env.example.
ITER_CHUNK_TYPE=one-by-one
#ITER_CHUNK_TYPE=efficiency

# IP blacklist (disabled; no value set).
#IP_BLACKLIST=
# Ports used by the WebUI (restart and log channels, per the variable names).
WEBUI_RESTART_PORT=15555
WEBUI_LOG_PORT=15556

# NOTE(review): presumably the streaming behavior assumed when a request does not
# specify one itself — confirm in .env.example.
DEFAULT_STREAM_RESPONSE=true
# Set timezone
TZ=Asia/Shanghai