
    *iw                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJrJrJrJrJrJrJrJrJrJrJr  S SKJr  S SKJr  SSK7  SSKJr  SS	K J!r!J"r"J#r#J$r$  SS
K%J&r&J'r'  S SK(J)r)  S SK*J+r+  S SK,J-r-  S SK.r/S SK0Jr1  S SK2J3r4  SSK5J6r6  SSK7J8r8   " S S5      r9 " S S5      r:\\1Rv                  \/Rx                     \1Rv                  \/Rz                     /\1Rv                  \/Rz                     4   r> " S S\\>   5      r?\\1Rv                  \/Rx                     \1Rv                  \/Rz                     /\@4   rA " S S\\A   5      rB " S S\>5      rCg)    )annotationsN)AnyListLiteralOptionalUnion	GeneratorSequenceIteratorDequeCallableDict)deque)Path   )*)LlamaGrammar)BaseLlamaCache
LlamaCacheLlamaDiskCacheLlamaRAMCache)BaseLlamaTokenizerLlamaTokenizer)LlamaDraftModel)set_verbose)suppress_stdout_stderrc            ,         \ rS rSrSrSrS\R                  SSSSSS\R                  SSSSS\R                  \R                  SSS	S
SS
SSSSSSSSSSS
SSSSSSSSSSS.+                                                                                       S?S jjr\S@S j5       r\SAS j5       r\SBS j5       r\SCS j5       r\SDS j5       r\SES j5       r SF       SGS jjr  SH       SIS jjrSJS jrSKS jrS rSLS jr               SM                             SNS! jjr                SO                               SPS" jjr                 SQ                                     SRS# jjr SS     STS$ jjr   SU       SVS% jjrSS&SSSS
SS/ SSS
SSSS
SS SSSSSS4                                                   SWS' jjrSS&SSSS
SS/ SSS
SSSS
SS SSSSSS4                                                   SXS( jjrSS&SSSS
SS/ SSS
SSSS
SS SSSSSS4                                                   SYS) jjr SSSSS*SSSS
S/ SSSSSS
S
SS SSSSSSS4                                                         SZS+ jjr!    S[S, jr"S- r#S. r$S\S/ jr%S]S0 jr&S^S1 jr'S^S2 jr(S^S3 jr)S_S4 jr*S^S5 jr+S^S6 jr,S^S7 jr-S`S8 jr.SaS9 jr/SaS: jr0\1 Sb     ScS; jj5       r2\1SdS< j5       r3\4    Se               SfS= jj5       r5S>r6g)gLlama7   z0High-level Python wrapper for a llama.cpp model.Fr   NTi           g            ?g      @@@   )+n_gpu_layers
split_modemain_gputensor_split
vocab_onlyuse_mmap	use_mlockkv_overridesseedn_ctxn_batchn_ubatch	n_threadsn_threads_batchrope_scaling_typepooling_typerope_freq_baserope_freq_scaleyarn_ext_factoryarn_attn_factoryarn_beta_fastyarn_beta_slowyarn_orig_ctx
logits_all	embeddingoffload_kqv
flash_attn
op_offloadswa_fullno_perflast_n_tokens_size	lora_base
lora_scale	lora_pathnumachat_formatchat_handlerdraft_model	tokenizertype_ktype_v
spm_infillverbosec       +        ,   ^  U,T l         [        R                  " 5       T l        [	        U,5        [
        R                  (       d2  [        U,S9   [        R                  " 5         SSS5        S[
        l        [        U$[        5      (       a-  U$(       a  [        R                  O[        R                  T l        OU$T l        T R                  [        R                  :w  a2  [        U,S9   [        R                  " T R                  5        SSS5        UT l        [        R"                  " 5       T l        US:X  a  SOUT R$                  l        UT R$                  l        UT R$                  l        UT l        ST l        T R,                  b  [1        T R,                  5      [        R2                  :  a  [5        S[        R2                   35      e[6        R8                  [        R2                  -  n.U." U6 T l        T R.                  T R$                  l        UT R$                  l        U#c  UOST R$                  l        UT R$                  l        U	T l         U	Gb  [1        U	5      S-   n/[        RB                  U/-  " 5       T l"        [G        U	RI                  5       5       GH{  u  n0u  n1n2U1RK                  S	5      T RD                  U0   l&        [        U2[        5      (       aB  [        RN                  T RD                  U0   l(        U2T RD                  U0   RR                  l*        M  [        U2[V        5      (       aB  [        RX                  T RD                  U0   l(        U2T RD                  U0   RR                  l-        M  [        U2[\        5      (       aC  [        R^                  T RD                  U0   l(        U2T RD                  U0   RR                  l0        GM3  [        U2[b        5      (       Ga$  U2RK                  S	5      n3[1        U35      S
:  a  [5        SU1 SU2 35      eU3Re                  S
S5      n3[        Rf                  T RD                  U0   l(        [h        Rj                  " [V        [6        Rl                  " T RD                  U0   RR                  5      [        Rn                  Rp                  Rr                  -   5      n4[6        Rj                  " U4[6        Rt                  " [6        Rv                  5      5      n5[6        Rx                  " U5U3S
5        GMm  [5        SU1 SU2 35      e   ST RD                  S   l&        T RD                  T R$                  l         [{        X5      T l>        U=(       d"    [        [        R                  " 5       S-  S5      T lB        U=(       d    [        R                  " 5       T lC        U
=(       d    [        R                  T lE        [        R                  " 5       T lG        UT R                  lH        T R|                  T R                  l>        [{        T R|                  U5      T R                  lI        T R                  T R                  lB        T R                  T R                  lC        Ub  UO[        R                  T R                  lK        UT R                  lL        US:w  a  UOST R                  lM        US:w  a  UOST R                  lN        US:w  a  UOST R                  lO        US:w  a  UOST R                  lP        US:w  a  UOST R                  lQ        US:w  a  UOST R                  lR        US:w  a  UOST R                  lS        U'c  UOST lT        UT R                  lU        UT R                  lV        UT R                  lW        Ub  UT R                  lX        Ub  UT R                  lY        U)b  U)T R                  lZ        U*b  U*T R                  l[        UT R                  l\        U T l]        ST l^        U!T l_        U"T l`        U#T la        U+T lb        [        R                  R                  U5      (       d  [5        SU 35      eT R                  R                  [        R                  " [        R                  " T R                   T R$                  T R                   S95      5      T lj        U(=(       d    [        T 5      T ll        US:X  a  T R                  R                  5       n[{        X5      T l>        T R                  R                  5       T R                  lH        T R|                  T R                  l>        [{        T R|                  U5      T R                  lI        T R                  R                  [        R                  " [        R                  " T R                  T R                  T R                   S95      5      T lo        T R                  R                  [        R                  " [        R                  " T R|                  ST R                  R                  T R                   S95      5      T lq        ST lr        T R                  (       a  [        R                  " T R                  R                  T R                  RK                  S	5      5      T lr        T R                  c  [        ST R                   35      eU 4S jn6T R                  R                  U65        [        R                  " T R                  R                  T R                  T R                  5      (       a  [        ST R                   35      eT R                   (       a:  [        [        R                  " 5       R                  S	5      [        R                  S9  U%T l~        U&T l        0 T l        U'T l        T GR                  5       T l        T R                  5       T l        T GR                  5       T l        T GR                  5       T l        [        GR                  " T GR                  S9T l        ST l        G[        GR                  " U4G[        GR                  S9T l        G[        GR                  " US:X  a  UOUT GR                  4G[        GR                   S9T l        [6        R8                  " S5      T l         T R                  GR'                  5       T l        T R                   (       a&  [        ST GR&                   3[        R                  S9  T GR                  5       n8T GR+                  5       n9U8S:w  a  T R                  GR-                  U85      OS n:U9S:w  a  T R                  GR-                  U95      OS n;G[/        S! T GR&                  RI                  5        5       5      n<S"T GR&                  ;   a  T GR&                  S"   U<S#'   T R                   (       aA  U<(       a:  [        S$S%GR1                  U<GR3                  5       5       3[        R                  S9  U<RI                  5        H=  u  n=n>G[4        GR6                  " U>U:U;U8/S&9GR9                  5       T GR                   U='   M?     T R                  c  T R                  c  S#U<;   a  G[4        GR:                  " T GR&                  5      n%U%b4  U%T l~        T R                   (       a  [        S'U% 3[        R                  S9  OlT R                   (       aT  [        S(U<S#    3[        R                  S9  [        S)U: 3[        R                  S9  [        S*U; 3[        R                  S9  S#T l~        T R                  cJ  T R                  c=  S+T l~        T R                   (       a%  [        S,T R                   3[        R                  S9  ST l        g! , (       d  f       GNK= f! , (       d  f       GN= f! G[(         a>  n70 T l        T R                   (       a  [        SU7 3[        R                  S9   Sn7A7GNASn7A7ff = f)-a{  Load a llama.cpp model from `model_path`.

Examples:
    Basic usage

    >>> import llama_cpp
    >>> model = llama_cpp.Llama(
    ...     model_path="path/to/model",
    ... )
    >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"])
    the lazy dog

    Loading a chat model

    >>> import llama_cpp
    >>> model = llama_cpp.Llama(
    ...     model_path="path/to/model",
    ...     chat_format="llama-2",
    ... )
    >>> print(model.create_chat_completion(
    ...     messages=[{
    ...         "role": "user",
    ...         "content": "what is the meaning of life?"
    ...     }]
    ... ))

Args:
    model_path: Path to the model.
    n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
    split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
    main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
    tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
    vocab_only: Only load the vocabulary no weights.
    use_mmap: Use mmap if possible.
    use_mlock: Force the system to keep the model in RAM.
    kv_overrides: Key-value overrides for the model.
    seed: RNG seed, -1 for random
    n_ctx: Text context, 0 = from model
    n_batch: Prompt processing maximum batch size
    n_ubatch: Physical batch size
    n_threads: Number of threads to use for generation
    n_threads_batch: Number of threads to use for batch processing
    rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
    pooling_type: Pooling type, from `enum llama_pooling_type`.
    rope_freq_base: RoPE base frequency, 0 = from model
    rope_freq_scale: RoPE frequency scaling factor, 0 = from model
    yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
    yarn_attn_factor: YaRN magnitude scaling factor
    yarn_beta_fast: YaRN low correction dim
    yarn_beta_slow: YaRN high correction dim
    yarn_orig_ctx: YaRN original context size
    logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
    embedding: Embedding mode only.
    offload_kqv: Offload K, Q, V to GPU.
    flash_attn: Use flash attention.
    op_offload: offload host tensor operations to device
    swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    no_perf: Measure performance timings.
    last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
    lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
    lora_path: Path to a LoRA file to apply to the model.
    numa: numa policy
    chat_format: String specifying the chat format to use when calling create_chat_completion.
    chat_handler: Optional chat handler to use when calling create_chat_completion.
    draft_model: Optional draft model to use for speculative decoding.
    tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
    verbose: Print verbose output to stderr.
    type_k: KV cache data type for K (default: f16)
    type_v: KV cache data type for V (default: f16)
    spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

Raises:
    ValueError: If the model path does not exist.

Returns:
    A Llama instance.
)disableNTizZAttempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES=Fr   utf-8   z
Value for z is too long:     zUnknown value type for z:    r    r   zModel path does not exist: )
path_modelparamsrM   )modelrV   rM   )n_tokensembd	n_seq_maxrM   z2Failed to initialize LoRA adapter from lora path: c                 p   > T R                   c  g [        R                  " T R                   5        S T l         g N)_lora_adapter	llama_cppllama_adapter_lora_freeselfs   M/var/www/html/ai-backend/venv/lib/python3.13/site-packages/llama_cpp/llama.pyfree_lora_adapter)Llama.__init__.<locals>.free_lora_adapter  s.    %%-11$2D2DE%)"    z+Failed to set LoRA adapter from lora path: file)n_vocabdtypeg      $@zFailed to load metadata: zModel metadata:  c              3  d   #    U  H&  u  pUR                  S 5      (       d  M  USS U4v   M(     g7f)ztokenizer.chat_template.
   N)
startswith).0nametemplates      rb   	<genexpr>!Llama.__init__.<locals>.<genexpr>  s3       
"79: "T"#Y!"7s   00ztokenizer.chat_templatezchat_template.defaultz&Available chat formats from metadata: z, )rq   	eos_token	bos_tokenstop_token_idszGuessed chat format: zUsing gguf chat template: zUsing chat eos_token: zUsing chat bos_token: zllama-2zUsing fallback chat format: )rM   
contextlib	ExitStack_stackr   r   _Llama__backend_initializedr   r^   llama_backend_init
isinstanceboolGGML_NUMA_STRATEGY_DISTRIBUTEGGML_NUMA_STRATEGY_DISABLEDrE   llama_numa_init
model_pathllama_model_default_paramsmodel_paramsr#   r$   r%   r&   _c_tensor_splitlenLLAMA_MAX_DEVICES
ValueErrorctypesc_floatr'   r(   r)   r*   llama_model_kv_override_kv_overrides_array	enumerateitemsencodekeyLLAMA_KV_OVERRIDE_TYPE_BOOLtagvalueval_boolintLLAMA_KV_OVERRIDE_TYPE_INTval_i64floatLLAMA_KV_OVERRIDE_TYPE_FLOATval_f64strljustLLAMA_KV_OVERRIDE_TYPE_STRtypingcast	addressofllama_model_kv_override_valueval_stroffsetPOINTERc_charmemmoveminr-   maxmultiprocessing	cpu_countr/   r0   LLAMA_DEFAULT_SEED_seedllama_context_default_paramscontext_paramsr,   r.   #LLAMA_ROPE_SCALING_TYPE_UNSPECIFIEDr1   r2   r3   r4   r5   r6   r7   r8   r9   _logits_all
embeddingsr<   r=   r>   r?   rJ   rK   r@   rA   cacherB   rC   rD   rL   ospathexistsenter_contextclosing	internals
LlamaModel_modelr   
tokenizer_n_ctx_trainLlamaContext_ctx
LlamaBatch_batchr]   llama_adapter_lora_initrW   RuntimeErrorcallbackllama_set_adapter_loractxprintllama_print_system_infodecodesysstderrrF   rG   _chat_handlersrH   rh   _n_vocab_n_ctxtoken_nl	_token_nl	token_eos
_token_eosLlamaTokenDataArray_candidatesrX   npndarrayintc	input_idssinglescores_mirostat_mumetadata	Exception	token_bostoken_get_textdictjoinkeysllama_chat_formatJinja2ChatFormatterto_chat_handler$guess_chat_format_from_gguf_metadata_sampler)?ra   r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   kwargs
FloatArraykvo_array_lenikvv_bytesaddressbuffer_startrc   eeos_token_idbos_token_idrt   ru   template_choicesrp   rq   s?   `                                                              rb   __init__Llama.__init__<   s   V  **,G**'8,,. 9*.E'dD!!  77:: I DI99	==='8))$))4 9 % &@@B&",J, 	& (2$%-"(#(4$$%	(C(CC pqz  rM  rM  qN  O   )*E*EEJ#-$D  .2-A-AD*'1$1:1BX"&/# )#-1M11MA(D$ '|'9'9';<	6Aq23((72C((+/a&& &AA ,,ABD,,Q/55>3'' &@@ ,,@AD,,Q/55=5)) &BB ,,@AD,,Q/55=3''hhw/G7|c)(:aSqc)JKK%mmC7G &@@ ,,$kk(()A)A!)D)J)JK#AAIIPPQG
 $*;;wv}}8U#VLNN$ %'>qcA3%GHHM =T  $$-1-E-ED*5*"Nc/*C*C*E*JA&N.M/2K2K2M 9Y99
 (DDF$)!&*ll#'*4<<'B$(,%.2.B.B+ !, >> 	-
 ,8(,3N 	*  /#5O1 	+  /#5O1 	+ !1C 7Q 	, -3N 	* -3N 	* >Ka=OMUV))4)<:$)2&*5')3&!-7D*+3D( )/D&)/D&&-#"4/3
"$"$ww~~j))::,GHHkk//$$#,, LL
 $;~d'; A:KK++-Eu.DL(,(?(?(AD%*.,,D'+.t||X+FD(KK--&&++.. LL
	 kk//$$!\\"1177 LL		
 HL>>!*!B!B!!%%g."D !!)"HHXY * KK  !23//		t114??  #A$..AQR  <<)335<<WECJJW&(  	 'jjl..*$88O/1zz5("''/R.0jj D(Ugt}}ERYY/
 #NN
	H KK002DM <<$T]]O43::F~~'~~' 9E8JDKK&&|4PR 	 9E8JDKK&&|4PR 	
    
"&--"5"5"7 
 
 %58<)945 <<,8CSCXCXCZ9[8\]ZZ
 /446ND((9(M(M!## ,~	)
 o % 7 $!!)'+;;+PPK &#. <<1+?cjjQ<<45EF]5^4_` ZZ 29+>SZZP29+>SZZP#: #(9(9(A(D||243C3C2DECJJ m
 98 98z  	HDM||1!5CJJG	Hs4   ~&%!~8- 
 &
~58

A@3A@@A@c                .    U R                   R                  $ r\   )r   r   r`   s    rb   r   	Llama.ctx%  s    yy}}re   c                .    U R                   R                  $ r\   )r   rW   r`   s    rb   rW   Llama.model)  s    {{   re   c                4    U R                   S U R                   $ r\   )r   rX   r`   s    rb   
_input_idsLlama._input_ids-  s    ~~o..re   c                @    U R                   S U R                  2S S 24   $ r\   )r   rX   r`   s    rb   _scoresLlama._scores1  s    {{?T]]?A-..re   c                t    [        U R                  S U R                   R                  5       U R                  S9$ )Nmaxlen)r   r   rX   tolistr   r`   s    rb   eval_tokensLlama.eval_tokens5  s+    T^^Odmm4;;=dkkRRre   c                    [        U R                  S U R                  2S S 24   R                  5       U R                  (       a  U R
                  S9$ SS9$ )Nr   r   )r   r   rX   r  r   r   r`   s    rb   eval_logitsLlama.eval_logits9  sM    KK$--*+224"&"2"24;;
 	
89
 	
re   c                :    U R                   R                  XU5      $ )a  Tokenize a string.

Args:
    text: The utf-8 encoded string to tokenize.
    add_bos: Whether to add a beginning of sequence token.
    special: Whether to tokenize special tokens.

Raises:
    RuntimeError: If the tokenization failed.

Returns:
    A list of tokens.
)r   tokenize)ra   textadd_bosspecials       rb   r	  Llama.tokenize@  s      ''w??re   c                6    U R                   R                  XUS9$ )a  Detokenize a list of tokens.

Args:
    tokens: The list of tokens to detokenize.
    prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
    special: Whether to detokenize special tokens.

Returns:
    The detokenized string.
)prev_tokensr  )r   
detokenize)ra   tokensr  r  s       rb   r  Llama.detokenizeR  s%      ))W * 
 	
re   c                    Xl         g)z3Set the cache.

Args:
    cache: The cache to set.
N)r   )ra   r   s     rb   	set_cacheLlama.set_cachef  s	     
re   c                    Xl         g)z7Set the random seed.

Args:
    seed: The random seed.
N)r   )ra   r+   s     rb   set_seedLlama.set_seedn  s	     
re   c                    SU l         g)zReset the model state.r   N)rX   r`   s    rb   resetLlama.resetv  s	    re   c                   U R                   R                  SU R                  S5        [        S[	        U5      U R
                  5       GH.  nX[        [	        U5      X R
                  -   5       nU R                  n[	        U5      nU R                  R                  X4U R                  S9  U R                   R                  U R                  5        X0R                  XDU-   & U R                  (       ap  UnU R                  n[        R                  R                  U R                   R!                  5       Xg-  4S9nXR"                  XDU-   2SS24   R%                  S5      SS& O U =R                  U-  sl        GM1     g)zNEvaluate a list of tokens.

Args:
    tokens: The list of tokens to evaluate.
rP   r   )batchn_pastr:   )shapeN)r   kv_cache_seq_rmrX   ranger   r-   r   r   	set_batchr   r   r   r   r   	ctypeslibas_array
get_logitsr   reshape)	ra   r  r   r  r  rX   rowscolslogitss	            rb   eval
Llama.evalz  s:    			!!"dmmR8q#f+t||4As3v;LL0@ABE]]F5zHKK!!t7G7G "  IIT[[)9>NN6X$56}}..II((*4;. /  NTFh%669:BB2FrJ MMX%MM9 5re   (   ffffff?皙?皙?皙?      @c                ,  ^ ^ [         R                  " 5       nTb  SUU 4S jjnUR                  U5        UR                  T R                  UUUS9  Ub  UR                  T R                  U5        US:  a-  UR                  5         UR                  T R                  5        U$ US:X  a  UR                  5         U$ U
S:X  a-  SnUR                  T R                  T R                  UUU5        U$ U
S:X  a  UR                  T R                  UU5        U$ Sn[        SU5      nUR                  U5        UR!                  UU5        UR#                  UU5        UR%                  UU5        UR'                  U5        UR                  T R                  5        U$ )	Nc           	       > U R                   R                  nU R                   R                  n[        R                  " UR                   5      n[
        R                  " U4[
        R                  " S[
        R                  4S[
        R                  4S[
        R                  4/SS9[        R                  U-  R                  U5      S9nT H-  nU" TR                  UR                  5      UR                  S S & M/     g )NidlogitpT)align)r  rj   buf)contentssizedatar   r   r   recarrayrj   r   r   r^   llama_token_datafrom_addressr   r5  )token_data_arrayr:  data_soadata_soa_addressr<  logit_processorlogits_processorra   s         rb   
apply_func'Llama._init_sampler.<locals>.apply_func  s    '0055+4499#)#3#3H4E4E#F ;;'((7BII*>bii@PQ" #33d:HH(	 (8O(7(XHNN1% (8re   )penalty_last_npenalty_repeatpenalty_freqpenalty_presentr    r   d   rT   r   )r?  z"llama_cpp.llama_token_data_array_p)r   LlamaSampler
add_customadd_penaltiesrA   add_grammarr   add_softmaxadd_distr   
add_greedyadd_mirostatr   add_mirostat_v2r   	add_top_kadd_typical	add_top_p	add_min_padd_temp)ra   top_ktop_pmin_p	typical_ptemprepeat_penaltyfrequency_penaltypresence_penaltytfs_zmirostat_modemirostat_etamirostat_taupenalize_nlrC  grammarsamplerrD  
mirostat_mn_probsmin_keeps   `             `      rb   _init_samplerLlama._init_sampler  s   $ ((*'Y Y$ z*  22)*, 	 
	
 W5#:!TZZ(8 7 S[ 4 1 ! 
$$MMJJ  ,  !#''JJ    q'?!!%(##Ix8!!%2!!%2  &  ,re   c                >   U R                   S:  d   eSnU R                  c$  SnU R                  UUUUUUUUU	U
UUUUUS9U l        Ub  UU R                   -
  OSnU R                  c   eU R                  R	                  U R
                  U5      nU(       a  SU l        U$ )zSample a token from the model.

Args:
    top_k: The top-k sampling parameter.
    top_p: The top-p sampling parameter.
    temp: The temperature parameter.
    repeat_penalty: The repeat penalty parameter.

Returns:
    The sampled token.
r   FNTrY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rd  rc  re  rC  rf  rP   )rX   r   rk  r   sampler   )ra   rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rC  rf  idxtmp_samplerridxtokens                       rb   ro  Llama.sample  s    < }}q   == K ..#-"3!1+))'!1 / DM$ '*osT]]"2xx###$$TYY5 DMre   c              #    #    [         R                  " SU-  5      U l        U R                  UUUUUUU	U
UUUUUUUS9U l        U(       a  U R
                  S:  a  Sn[        U R                  USS 5       H  u  nnUU:X  a  US-  nM    O   US:  aG  SnUUS nUU l        U R                  (       a(  [        SU S	[        U5       S
3[        R                  S9  U(       a  U R                  5         U R
                  [        U5      -   S-
  n[        U5      n U R                  U5        UU R
                  :  a  U R!                  UUUUUUU	U
UUUUUUUUS9nUS-  nUb:  U" U R                  SU U R"                  UU R
                  -
  SS24   5      (       a  gUv nUR%                  5         UR'                  U5        Ub  UR)                  U5        UU R
                  :  aB  UU R                  U   :w  a/  UU l        U R*                  R-                  SU R
                  S5        OUU R
                  :  a  M  U R.                  b  XR0                  U R
                  U R
                  [        U5      -   & U R/                  U R0                  SU R
                  [        U5      -    5      nUR)                  UR3                  [4        5      SU R6                  U R
                  -
  [        U5      -
   5        GM  7f)aB  Create a generator of tokens from a prompt.

Examples:
    >>> llama = Llama("models/ggml-7b.bin")
    >>> tokens = llama.tokenize(b"Hello, world!")
    >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
    ...     print(llama.detokenize([token]))

Args:
    tokens: The prompt tokens.
    top_k: The top-k sampling parameter.
    top_p: The top-p sampling parameter.
    temp: The temperature parameter.
    repeat_penalty: The repeat penalty parameter.
    reset: Whether to reset the model state.

Yields:
    The generated tokens.
g       @rn  r   NrP   r   FzLlama.generate: z prefix-match hit, remaining z prompt tokens to evalrf   )rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rd  rc  rC  rf  re  rp  )r   r   r   rk  r   rX   zipr   rM   r   r   r   r   r  listr*  ro  r   clearappendextendr   r   rH   r   astyper   r   )ra   r  rY  rZ  r[  r\  r]  r^  r  r_  r`  ra  rb  rd  rc  re  rC  stopping_criteriarf  longest_prefixab
sample_idxrs  tokens_or_nonedraft_tokenss                             rb   generateLlama.generate6  s    R #NN3+=>**)/-'%%#- + 
& T]]Q&NDOOVCR[916"a'N	 :
 !0 .<<*>*: ;%%([M1GI ZZ JJL ]]S[014
f IIft}},'#1&7%5"/!-!-%5# +"! $ & a
$05FOOLj14<<
T]]@Z\]@]3^6 6 ',e$!-MM.1-%4??:;V2V$.DMII--b$--DE t}},H +NTt}}t}}s6{/JK#//NN#@T]]S[%@A   '',C$++5FCW s   H1K15B<K1c                    Ub  UOU R                   n[        U[        5      (       a  UOU/nU R                  USS9u  pE[	        U5       VVs/ s H  u  pgSUUS.PM     nnnSUUUUS.S.$ s  snnf )zhEmbed a string.

Args:
    input: The utf-8 encoded string to embed.

Returns:
    An embedding object.
T)return_countr;   )objectr;   indexrw  )prompt_tokenstotal_tokens)r  r;  rW   usage)r   r|   rw  embedr   )	ra   inputrW   
model_nameembedsr  rp  embr;  s	            rb   create_embeddingLlama.create_embedding  s     $)#4%$//
#E400ug
  $zz%dzC &f-!
 .	 & 
 . 	 !
 !- ,	
 	
!
s   A+c                v  ^ ^^^^ T R                  5       mT R                  nT R                  5       mT[        R                  :H  nT R
                  R                  SL a  [        S5      eT R                  (       a*  [        R                  " T R                  R                  5        [        U[        5      (       a  U/nOUnT R                  R                  5         / mS
UUUUU 4S jjnSn	/ n
SnSnU H  nT R!                  UR#                  S5      5      nU(       a  USU n[%        U5      nX-  n	X:  a  ['        SU SU 35      eX-   U:  a  U" U
5        / n
SnSnT R                  R)                  XU5        U
R+                  U5        X-  nUS	-  nM     U" U
5        T R                  (       a*  [        R,                  " T R                  R                  5        [        U[        5      (       a  TS   OTn[        R.                  " T R                  R                  5        T R                  5         U(       a  UU	4$ U$ )zhEmbed a string.

Args:
    input: The utf-8 encoded string to embed.

Returns:
    A list of embeddings
FzCLlama model must be created with embedding=True to call this methodc           	     b  > [         R                  " TR                  R                  5        TR                  R	                  TR
                  5        TR
                  R                  5         T[         R                  :X  a  Sn[        U 5       H  u  p#[         R                  " TR                  R                  5      n[        U5       Vs/ s H  nXAUT	-  -   XS-   T	-  -    PM     nnT
(       a&  U Vs/ s H  n[        R                  " U5      PM     nnTR                  U5        X-  nM     g [        [        U 5      5       Ha  n[         R                  " TR                  R                  U5      nUS T	 nT
(       a  [        R                  " U5      nTR                  U5        Mc     g s  snf s  snf Nr   r   )r^   llama_kv_self_clearr   r   r   r   r  LLAMA_POOLING_TYPE_NONEr   llama_get_embeddingsr!  r   normalize_embeddingry  r   llama_get_embeddings_seq)	seq_sizesposr   r:  ptrjr;   r   r;  n_embd	normalizer2   ra   s           rb   decode_batch!Llama.embed.<locals>.decode_batch  sa   ))$))--8IIT[[)KK y@@@(3GA#88GC "'t4!,A !f*,s!ev5E/EF!,  4 !FO%FOI99!<i " % KK	*KC  4 s9~.A#<<TYY]]ANC-0&\I $-$A$A)$L	KK	* /4
%s   F'1 F,r   rQ   NRequested tokens (z) exceed batch size of r   )r  	List[int])r  r-   r2   r^   r  r   r   r   rM   llama_perf_context_resetr   r   r|   r   r   r  r	  r   r   r   add_sequencery  llama_perf_context_printr  )ra   r  r  truncater  r-   r:   inputsr  r  s_batcht_batchp_batchr
  r  rX   outputr;  r  r2   s   ` `              @@@rb   r  Llama.embed  s    ,, ((*!Y%F%FF
))U2U  <<..tyy}}=eS!!WFF 	 CE	+ 	+:  D]]4;;w#78F)6{H$L ! (
2I'S 
 !G+W% KK$$VjA NN8$GqLG7 < 	W<<..tyy}}=&uc22a%%diimm4

<''Mre      c              #  !  ^]^^#    Ub  UR                   [        L d   eS[        [        R                  " 5       5       3n[	        [
        R
                  " 5       5      nU R                  5       nU R                  R                  5       nU R                  R                  5       nSnSn Sn!U R                  R                  SS5      S:H  n"US:w  a  UOU/n#US:w  a  UOU R                  5       /n$[        U[        5      (       a  Ub)  U R                  R                  5       (       a
  U#S S S/:X  a  / n#[        U[        5      (       a  Ub%  U R                  R!                  5       (       d  US:X  a  / n$Sn%U"(       a  U!S:  a  U(       a  SU-   nSn%[#        U5      S:  a  / OU/n&US:  a  Ub  U/O/ [        U[        5      (       a6  US	:w  a.  U R%                  UR'                  S
5      SUS:  =(       d    US L S9O/ OU-   n'U!S:  a3  Ub0  U!/U(       a#  U R%                  UR'                  S
5      SSS9U%S  O/ -   O/ n(U S:  a  Ub  U /O/ n)U#U R(                  (       a  U(U'-   U)-   OU'U(-   U)-   -   U$-   n*Sn+Sn,[        U
[        5      (       a  U
O[        U
[        5      (       a  U
/O/ n
Ub  UOU R*                  n-U*S S U R                  5       /S-  :X  aF  [,        R.                  " SU R                  R1                  U R                  5       5       S3[2        5        Ubj  UR5                  5        V.V/s0 s H  u  n.n/[	        U.5      [7        U/5      _M     sn/n.m]      S,U]4S jjn0[9        U0/5      n1Uc  U1nOUR;                  U15      nU R<                  (       a  U R>                  RA                  5         [#        U*5      U RB                  :  a8  [E        S[#        U*5       S[F        RH                  " U RJ                  5       35      eUb  US::  a  U RB                  [#        U*5      -
  nU[#        U*5      -   U RB                  :  a  UOU RB                  [#        U*5      -
  nU
/ :w  a"  U
 V2s/ s H  n2U2R'                  S
5      PM     n3n2O/ n3Ub  U RL                  SL a  [E        S5      eU RN                  (       a   U RN                  U*   n4[P        RS                  U4RT                  RW                  5       U*5      n5[P        RS                  U RX                  RW                  5       U*5      n6U5U6:  a:  U R[                  U45        U R<                  (       a  []        S[^        R`                  S9  Ub  U Re                  U5        O?U Re                  [f        Rh                  " U Rj                  5      Rm                  SS5      5        Sn7Sn8U Ro                  U*UUUUUUUUUUUUUUUS9 GH9  n9[F        Rp                  " U R                  Rr                  U95      (       a  U Ru                  U&U*S9n+Sn7  GOU&Rw                  U95        U Ru                  U&U*S9n:[y        U:SS  5       H/  u  n.n;SU.-
  n.S H  u  n<n=U<U.:  d  M  U=U;-  U=:X  d  M  U<U.-
  n8M      M1     U8S:  a  U8S-  n8M  U3 V2s/ s H  n2U2U:;   d  M  U2PM     n>n2[#        U>5      S:  a  U>S   n?U:S U:R{                  U?5       n+Sn7  GO?U(       Ga  U&U,S  n@U Ru                  U@U*U&S U, -   S9m^[#        T^5      nASnBU3 HN  n2[}        [        [#        U25      WA5      SS5       H'  nCT^R                  U2S UC 5      (       d  M  WCWB:  a  WCnB  ML     MP     SnDUGb  W@ GH  n9U9U:X  a  M  WD[#        U Ru                  U9/U*U&S U, -   S95      -  nDUDWAWB-
  :  a    GOPU Ru                  U9/U*U&S U, -   S9R                  S
SS 9nE[#        U5      [#        U Ru                  U&S U, U*U&S U, -   S9R                  S
SS 95      -   nF[#        U*5      U,-   nGU R                  UGS-
  S S 24   nH[P        R                  UH5      RW                  5       nI[        [        [        UI[}        [#        UI5      5      5      S!S"95      nJUJS U  VKVCs0 s H'  u  nKnCU Ru                  UC/5      R                  S
SS 9UK_M)     nLnKnCULR                  WEWI[	        U95         05        U Ru                  U9/U*U&S U, -   S9R                  S
SS 9/WF/UI[	        U95         /UL/S#.nMU,S-  n,US$UU-U Ru                  U9/U*U&S U, -   S9R                  S
SS 9SUMS S%./S&.v   GM     O[#        W@5      S:  a  SnN[}        S[#        W@5      S-   5       H0  nC U Ru                  W@S UC U*U&S U, -   S9nOUOR                  S
5      nPS!nN  O   OKWN(       d  OCWD[#        WO5      -  nDUDWAWB-
  :  a  O+W@WCS  n@U,UC-  n,US$UU-WPSS S S%./S&.v   [#        U@5      S:  a  M  [#        U&5      U:  d  GM(  U Ru                  U&U*S9n+Sn7  O   Ub;  U" U RX                  U R                  SS S 24   5      (       a  U Ru                  U&U*S9n+Sn7U R<                  (       a  U R>                  R                  5         U(       Ga  U&U,S  n@U Ru                  U@U*U&S U, -   S9m^U3 V2s/ s H  n2U2T^;   d  M  U2PM     n>n2[#        U>5      S:  a  [        U^4S' jU> 5       5      nQO[#        T^5      nQSnDW@ GH  n9WD[#        U Ru                  U9/U*U&S U, -   S95      -  nDS nMUGbY  U9U:X  a  M5  U Ru                  U9/5      R                  S
SS 9nE[#        U5      [#        U Ru                  U&S U, U*U&S U, -   S95      -   nF[#        U*5      U,-   S-
  nGU R                  UGS S 24   nH[P        R                  UH5      RW                  5       nI[        [        [        UI[}        [#        UI5      5      5      S!S"95      nJUJS U  VKVCs0 s H'  u  nKnCU Ru                  UC/5      R                  S
SS 9UK_M)     nLnKnCULR                  WEWI[	        U95         05        U Ru                  U9/5      R                  S
SS 9/WF/UI[	        U95         /UL/S#.nMWDWQ:  aT  U Ru                  U9/5      nRWDWQS-
  :X  a    OoU,S-  n,US$UU-WRS [#        UR5      WDWQ-
  -
   R                  S
SS 9SWMS S%./S&.v     O8U,S-  n,US$UU-U Ru                  U9/5      R                  S
SS 9SWMS S%./S&.v   GM     US$UU-S	SS U7S%./S&.v   U RN                  (       ar  U R<                  (       a  []        S([^        R`                  S9  U R                  5       U RN                  U*U&-   '   U R<                  (       a  []        S)[^        R`                  S9  g U RN                  (       aI  U R<                  (       a  []        S([^        R`                  S9  U R                  5       U RN                  U*U&-   '   U+R                  S
SS 9nSU	(       a  UWS-   nSU!S:  a  Ub  WSU-   nSS nMUGb  U	(       a  SO
[#        U5      nFU	(       a  SO[#        U*SS  5      nG/ nT/ nU/ nV/ nWU	(       a"  U*U*S   U R                  5       :X  a  SOSS  U&-   nXOU&nX[y        WX5       VCV9s/ s H(  u  nCn9U Ru                  U9/WXS UC S9R                  S
SS 9PM*     nYnCn9[P        R                  U R                  5      WGS  nZ[y        [        WXUYUZ5      5       GH  u  n[u  n9nEn\U9U:X  a  M  WTRw                  WF[#        U Ru                  WXS W[ 5      R                  S
SS 95      -   5        WVRw                  WE5        [        [        [        W\[}        [#        U\5      5      5      S!S"95      nJWURw                  U\[	        U95         5        UJS U  VKVCs0 s H)  u  nKnCU Ru                  UC/WXS W[ S9R                  S
SS 9UK_M+     nLnKnCULR                  WEW\[	        U95         05        WWRw                  UL5        GM     U	(       a  [#        WX5      S:  a
  S WUS'   S WWS'   WVWTWUWWS#.nMUS$UU-WSSWMU7S%./[#        U*5      [#        U&5      [#        U*5      [#        U&5      -   S*.S+.v   g s  sn/n.f s  sn2f ! [b         a-    U R<                  (       a  []        S[^        R`                  S9   GN\f = fs  sn2f s  snCnKf ! [         a     GMm  f = fs  sn2f s  snCnKf s  sn9nCf s  snCnKf 7f)-Nzcmpl-r   ztokenizer.ggml.add_space_prefixtruerP   r   u   ☺rT   rk   rQ   F)r  r  re   zDetected duplicate leading "zN" in prompt, this will likely reduce response quality, consider removing it...c                x   > [         R                  " U5      nTR                  5        H  u  p4XAU   -   X#'   M     U$ r\   )r   copyr   )r   r   
new_scoresinput_idscorelogit_bias_maps        rb   logit_bias_processor6Llama._create_completion.<locals>.logit_bias_processor  sC      WW
 (6';';'=OH+0(3C+CJ( (>!!re   r  z) exceed context window of zBlogprobs is not supported for models created with logits_all=Falsez#Llama._create_completion: cache hitrf   z$Llama._create_completion: cache missl        length)rY  rZ  r[  r\  r]  ra  rb  rd  rc  r_  r`  r^  r|  rC  rf  )r  stop   ))rT      )r     )      ignore)errorsT)reverse)r  text_offsettoken_logprobstop_logprobstext_completion)r
  r  logprobsfinish_reason)r4  r  createdrW   choicesc              3  F   >#    U  H  nTR                  U5      v   M     g 7fr\   )r  )ro   r  remaining_texts     rb   rr   +Llama._create_completion.<locals>.<genexpr>  s     J...t44s   !z$Llama._create_completion: cache savez%Llama._create_completion: cache saved)r  completion_tokensr  )r4  r  r  rW   r  r  r   npt.NDArray[np.intc]r   npt.NDArray[np.single]returnr  )J	__class__r   uuiduuid4r   timer   r   	token_cls	token_sepr   getr   r|   rw  add_bos_tokenadd_eos_tokenr   r	  r   rL   r   warningswarnr   RuntimeWarningr   r   LogitsProcessorListrz  rM   r   reset_timingsr   r   r^   llama_n_ctxr   r   r   r   longest_token_prefixr   r  r   
load_stater   r   r   KeyErrorr  randomRandomr   randintr  llama_token_is_eogvocabr  ry  r   r  r!  r   endswithr   r   logits_to_logprobssortedrv  updateUnicodeErrorprint_timings
save_state)_ra   promptsuffix
max_tokenstemperaturerZ  r[  r\  r  echor  r_  r`  r^  rY  streamr+   ra  rb  rd  rc  rW   r|  rC  rf  
logit_biascompletion_idr  r   cls_token_idsep_token_idprefix_token_idmiddle_token_idsuffix_token_idadd_space_prefix
bos_tokens
eos_tokenssuffix_space_prefixr  prefix_tokenssuffix_tokensmiddle_tokensr  r
  returned_tokensr  r   r   r  _logit_bias_processorsstop_sequences
cache_itemcache_prefix_leneval_prefix_lenr  multibyte_fixrs  all_textcharnumpatternany_stop
first_stopremaining_tokensremaining_lengthfirst_stop_positionr   token_end_position	token_strr  token_offsetr)  current_logprobssorted_logprobslogprobtop_logproblogprobs_or_nonedecode_successbstsend	last_texttext_strtext_offsetsr  r  r  
all_tokensall_token_strsall_logprobsrp  logprobs_tokenr  r  s_                                                                                                @@rb   _create_completionLlama._create_completionc  s    < ~!1!1S!888$S%6$78499;' NN, KK113 KK113   MM?HFR 	 2>1C V
(B.LDNN4D!


 %%&.;;,,.."1~"%Jvt$$))++0BJ#$1 4V^F"# .1[1_r<. "1A!5&:L_RT &#&& R< MM'*!,q0BFdN    $
4 !#(: !!
  MM&--"8%QVMW+,   	 "1A!5&:L_RT 	  ?? .>#m3mC	  	  tT**D*T3:O:OUW 	 $)#4%$//
!!1 2Q 66MM.t{{/I/I$..JZ/[.\  ]k  l !;E;K;K;MN;M41ac!feAh.;MNN	"/	".	" (	" %89M8N$O!'#8 #3#:#:;P#Q <<II##%},$S%7$88ST]TiTijnjrjrTsStu  qs='99J
 C..< ++M 22 	 2:9=>Aahhw/N>NND$4$4$=T  ::S!ZZ6
#(#=#=((//1=$  #("<"<OO**,m# $o5OOJ/||C#**U
 MM$MM&--

3;;AwGH ]]'%%/-)/-! # 
E$ ++DKK,=,=uEE'8mT &$$U+'8mTH %Xbc]34E$BLCQw7T>W#<(+a %C 4 q "#1C>aQ(]>HC8}q %a[
 <(.."<= &#4_5E#F !%$ -0ABR?0S S "1 " $'~#6 
 '(#'A"3s1v/?#@!RH)221Ra599 #6667 3!	 I ( &'"' "2 L0$*c OO!&,9"34D_"E-F , / * .,/BB "$(OO"G(5/0@A)B %4 % !&&:	 "
 '*&kC OO 12B? C,9"34D_"E-F ,  %fWXf>5 ' (+='9O'K!%lQ.>.A!B+0+C+CF+K+R+R+T(*." #$4eC@P<Q6R S(,+ />ix.H	' /I
 !OOQC077 ' 8 &' /I	 $ ' $**I7GE
7S+TU !%%*G0=&78H&I1J !0 !" #)&&"B' -8=/?E
/K.L-8M,( (1,"/&7'.%/ -1OO).4A*;<L_*M5N -< -& '-fWXf&F-.0@59	!"( q "2V ./!3).!&q#.>*?!*C!DA
%%)__$4Ra$80=&78H&I1J &5 &"
 &(YYw%715 % "E "-!*c"g5*-,/BB "+;AB+?('1, #0&7'.%/ -/-.0459	!"( 7 ./!3T $%3'8mT (
B (->OOT\\"a%0.
 .
 ??#4-?PD"M<<II##%01AB!__ ),=>N,OO - N $2I>aQ.5H>HI8}q JJJ.)!")"cOO$14EFV4W$W $ ' " BF ',  $ 8 ? ? !@ !I #&f+-.>?(5/0@A)B ( 1 #K $'}#5#G!#KL!\\,/:F','?'?'G'N'N'P$&* 0%<L8M2NO$('O +:)8*D#*DJGQ ,33GH3MwV*D   #  &&	3CCJ3O'PQ !OOUG4;;GH;U# )4}+;CJ+G*H)4($ &, $ 8I)S1W4#q(O+"3#*!+ )2$Qc)n8JS8P&Q)""(&&"B)*,<15	$   1$'/&' %)OOUG$<$C$C ' %D % &'(8-1	  I *j $+"# !#!"$()6	  zz<<@szzR@D@Q

=+<<=<<A

S::||<3::N<@OO<MDJJ}'889;;wx;8(HQ6#5&(H9=#!VK $1#mAB.?*@L&(L46N "F=?L "}Q'74>>;K'K!QR"TU'( 
 /
 !** 5	 !6HAu Z^DKKH L  !6	   !33DLLA,-PL;DJ=<77eY L(##
4C(89@@#H A  i("&NE#n2E,FGQU#
 %%nSZ&@A
 '6ix&@	; 'A
 OOQCZ5EOFMM N  'A	  ; ""I~c%j/I#JK##K07<> J!+$(q!"&Q +"0 ,	   ' % 0%2	 "%]!3%():%; #M 2S9J5K K
 	
s ON ?,  S<<@szzRSf DF'f $0 % $%d JL#T8;s  L!AC'%"AA.DAC'AA443AC'(B+AA9 DAC'AC''AC'
AB3AB3BAC';D3AC'..AB8
CAC'#,AB>AAC'AC'1B&AC'
AC%AC+D7AC'".ACI3AC'/AC2C0AC'"0AC!B'AC'A93AB0B,AC'B/AB0B0AC'B>
ACCAC'CACCAC'c                    U R                  UUUc  SOUUUUUUU	U
UUUUUUUUUUUUUUUS9nU(       a  UnU$ [        U5      nU$ )
  Generate text from a prompt.

Args:
    prompt: The prompt to generate text from.
    suffix: A suffix to append to the generated text. If None, no suffix is appended.
    max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
    temperature: The temperature to use for sampling.
    top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
    typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    logprobs: The number of logprobs to return. If None, no logprobs are returned.
    echo: Whether to echo the prompt.
    stop: A list of strings to stop generation when encountered.
    frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
    presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
    repeat_penalty: The penalty to apply to repeated tokens.
    top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    stream: Whether to stream the results.
    seed: The seed to use for sampling.
    tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
    mirostat_mode: The mirostat sampling mode.
    mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    model: The name to use for the model in the completion object.
    stopping_criteria: A list of stopping criteria to use.
    logits_processor: A list of logits processors to use.
    grammar: A grammar to use for constrained sampling.
    logit_bias: A logit bias to use.

Raises:
    ValueError: If the requested tokens exceed the context window.
    RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.

Returns:
    Response object containing the generated text.
rP   r  r  r  r  rZ  r[  r\  r  r  r  r_  r`  r^  rY  r  r+   ra  rb  rd  rc  rW   r|  rC  rf  r  )r+  next)ra   r  r  r  r  rZ  r[  r\  r  r  r  r_  r`  r^  rY  r  r+   ra  rb  rd  rc  rW   r|  rC  rf  r  completion_or_chunkschunks
completions                                rb   create_completionLlama.create_completion  s    @  $66'/rZ#/-)'%%/-!3  7  
6 ?SFM!%&:!;
re   c                P    U R                  UUUUUUUUU	U
UUUUUUUUUUUUUUUS9$ )r.  r/  )r4  )ra   r  r  r  r  rZ  r[  r\  r  r  r  r_  r`  r^  rY  r  r+   ra  rb  rd  rc  rW   r|  rC  rf  r  s                             rb   __call__Llama.__call__0  sh    @ %%!#/-)'%%/-!3 & 
 	
re   g?c                z   U R                   =(       dL    U R                  R                  U R                  5      =(       d     [        R
                  " U R                  5      nU" S0 SU _SU_SU_SU_SU_SU_SU_SU_S	U_S
U	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_6$ )a  Generate a chat completion from a list of messages.

Args:
    messages: A list of messages to generate a response for.
    functions: A list of functions to use for the chat completion.
    function_call: A function call to use for the chat completion.
    tools: A list of tools to use for the chat completion.
    tool_choice: A tool choice to use for the chat completion.
    temperature: The temperature to use for sampling.
    top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
    typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    stream: Whether to stream the results.
    stop: A list of strings to stop generation when encountered.
    seed: The seed to use for sampling.
    response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json.
    max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
    presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
    frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
    repeat_penalty: The penalty to apply to repeated tokens.
    tfs_z: The tail-free sampling parameter.
    mirostat_mode: The mirostat sampling mode.
    mirostat_tau: The mirostat sampling tau parameter.
    mirostat_eta: The mirostat sampling eta parameter.
    model: The name to use for the model in the completion object.
    logits_processor: A list of logits processors to use.
    grammar: A grammar to use.
    logit_bias: A logit bias to use.

Returns:
    Generated chat completion or a stream of chat completion chunks.
llamamessages	functionsfunction_calltoolstool_choicer  rZ  rY  r[  r\  r  r  r  r  r+   response_formatr  r`  r_  r^  ra  rb  rd  rc  rW   rC  rf  r   )rG   r   r  rF   r   get_chat_completion_handler)ra   r;  r<  r=  r>  r?  r  rZ  rY  r[  r\  r  r  r+   r@  r  r`  r_  r^  ra  rb  rd  rc  rW   rC  rf  r  r  r  handlers                                 rb   create_chat_completionLlama.create_chat_completion  sz   F  O""&&t'7'78O <<T=M=MN 	
  


  
 (	

 
 $
 $
 
 
 
  
 
 &
 
 
  !
" ,#
$ "%
& .'
( 0)
* *+
, -
. (/
0 &1
2 &3
4 5
6 .7
8 9
: ";
 	
re   c           	       ^  SSK JnJm  UR                  SS5      n[	        U[
        5      (       d   eU(       a  U4S jU R                  " U0 UD6 5       $ U" S0 U R                  " U0 UD6D6$ ! [         a    [        S5      ef = f)a  Generate a chat completion with return type based on the the OpenAI v1 API.

OpenAI python package is required to use this method.

You can install it with `pip install openai`.

Args:
    *args: Positional arguments to pass to create_chat_completion.
    **kwargs: Keyword arguments to pass to create_chat_completion.

Returns:
    Generated chat completion or a stream of chat completion chunks.
r   )ChatCompletionChatCompletionChunkr  Fc              3  4   >#    U  H  nT" S0 UD6v   M     g 7fNrA  rA  )ro   chunkrH  s     rb   rr   9Llama.create_chat_completion_openai_v1.<locals>.<genexpr>  s     oBn+4e4Bns   zzTo use create_chat_completion_openai_v1, you must install the openai package.You can install it with `pip install openai`.rA  )openai.types.chatrG  rH  r  r|   r}   rD  ImportError)ra   argsr   rG  r  rH  s        @rb    create_chat_completion_openai_v1&Llama.create_chat_completion_openai_v1  s    $	MZZ%0Ffd++++o$B]B]_cBngmBnoo%U(C(CT(TV(TUU 	@ 	s   AA/ A/ /Bc                   [        S,0 SU R                  _SU R                  R                  _SU R                  R                  _SU R                  R
                  _SU R                  _SU R                  R                  _SU R                  R                  _SU R                  R                  _S	U R                  _S
U R                  _SU R                  R                  _SU R                  _SU R                  R                  _SU R                  R                   _SU R                  R"                  _SU R                  R$                  _SU R                  R&                  _SU R                  R(                  _SU R                  R*                  _SU R                  R,                  _SU R                  R.                  _SU R                  R0                  _SU R                  R2                  _SU R                  R4                  _SU R6                  _SU R                  R8                  _SU R                  R:                  _SU R                  R<                  _SU R                  R>                  _SU R                  R@                  _SU R                  RB                  _S U RD                  _S!U RF                  _S"U RH                  _S#U RJ                  _S$U RL                  _S%U RN                  _S&U RP                  _S'U RR                  _S(U R                  RT                  _S)U R                  RV                  _S*U RX                  _S+U RZ                  _6$ )-Nr   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rJ   rK   rL   rM   rA  ).r   r   r   r#   r$   r%   r&   r'   r(   r)   r*   r   r   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r   r   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rJ   rK   rL   rM   r`   s    rb   __getstate__Llama.__getstate__  si    5
5
 **775
 ((33	5

 &&//5
 **5
 ((335
 &&//5
 ''115
 **5
 5
 %%++5
 LL5
 ((115
  ))33!5
" !//??#5
$ #11CC%5
& ,,99'5
(  ..==)5
* !//??+5
, !//??-5
. "00AA/5
0  ..==15
2  ..==35
4 --;;55
6 ''75
8 ))4495
: ++77;5
< **55=5
> **55?5
@ ((11A5
D ''//E5
F  $66G5
J nnK5
L M5
N nnO5
R S5
V ((W5
X **Y5
\ ((]5
` &&--a5
b &&--c5
f g5
h LLi5
 5	
re   c                (    U R                   " S0 UD6  g rJ  )r   )ra   states     rb   __setstate__Llama.__setstate__L  s    re   c           	     ,   U R                   (       a  [        S[        R                  S9  [        R
                  " U R                  R                  5      nU R                   (       a  [        SU 3[        R                  S9  [        R                  [        U5      -  " 5       nU R                   (       a  [        S[        R                  S9  [        R                  " U R                  R                  U5      nU R                   (       a  [        SU 3[        R                  S9  [        U5      [        U5      :  a  [        S5      e[        R                  [        U5      -  " 5       n[        R                  R                  XB[        U5      5        U R                   (       a  [        SU S3[        R                  S9  [        U R                  R!                  5       U R"                  R!                  5       U R$                  ['        U5      UU R(                  S	9$ )
Nz$Llama.save_state: saving llama staterf   z"Llama.save_state: got state size: z!Llama.save_state: allocated statez&Llama.save_state: copied llama state: zFailed to copy llama state datazLlama.save_state: saving z bytes of llama state)r   r   rX   llama_statellama_state_sizer+   )rM   r   r   r   r^   llama_get_state_sizer   r   r   c_uint8r   llama_copy_state_datar   r   
LlamaStater   r  r   rX   bytesr   )ra   
state_sizerZ  n_bytesllama_state_compacts        rb   r  Llama.save_stateO  sh   <<8szzJ33DIIMMB
<<6zlC#**U~~J7:<<5CJJG11$))--M<<:7)D3::Vw<#j/)@AA%~~G<?  !43w<P<<+G94IJZZ <<$$&nn))+]]12$
 	
re   c                (   UR                   R                  5       U R                   S UR                  2S S 24'   U R                   UR                  S 2S S 24   nSX"S:  '   UR                  R                  5       U l        UR                  U l        UR                  U l        UR                  n[        R                  U-  nUR                  UR                  5      n[        R                  " U R                  R                  U5      U:w  a  [        S5      eg )Nr    r   zFailed to set llama state data)r   r  rX   r   r+   r   r[  r   r]  from_buffer_copyrZ  r^   llama_set_state_datar   r   r   )ra   rV  restra  LLamaStateArrayTyperZ  s         rb   r  Llama.load_statem  s    +0<<+<+<+>$enn$a'({{5>>+Q./AX--/ZZ
++
$nnz9)::5;L;LM))$))--ES?@@ Tre   c                6    U R                   R                  5       $ )zReturn the context window size.)r   r,   r`   s    rb   r,   Llama.n_ctx|  s    yy  re   c                6    U R                   R                  5       $ )zReturn the embedding size.)r   r  r`   s    rb   r  Llama.n_embd  s    {{!!##re   c                6    U R                   R                  5       $ )zReturn the vocabulary size.)r   rh   r`   s    rb   rh   Llama.n_vocab  s    {{""$$re   c                    [        U 5      $ )z*Return the llama tokenizer for this model.)r   r`   s    rb   rI   Llama.tokenizer  s    d##re   c                6    U R                   R                  5       $ )z!Return the end-of-sequence token.)r   r   r`   s    rb   r   Llama.token_eos      {{$$&&re   c                6    U R                   R                  5       $ )z'Return the beginning-of-sequence token.)r   r   r`   s    rb   r   Llama.token_bos  ru  re   c                6    U R                   R                  5       $ )zReturn the newline token.)r   r   r`   s    rb   r   Llama.token_nl  s    {{##%%re   c                6    U R                   R                  5       $ )zReturn the pooling type.)r   r2   r`   s    rb   r2   Llama.pooling_type  s    yy%%''re   c                8    U R                   R                  5         g)z&Explicitly free the model from memory.N)ry   closer`   s    rb   r}  Llama.close  s    re   c                $    U R                  5         g r\   )r}  r`   s    rb   __del__Llama.__del__  s    

re   c                   [         R                  " XSS9nUR                  S:  a  SU[         R                  " U5      ) '   O[         R                  " U5      (       d  Sn[         R                  " X[         R
                  S9n[         R                  " U5      n[         R                  " SS9   [         R                  " XASS9n[         R                  " U5      nS S S 5        X6-
  $ ! , (       d  f       UW-
  $ = f)NT)axiskeepdimsr   ri   r  )divide)
r   amaxndimisfinitesubtractr   experrstatesumlog)r)  r  logits_maxssubtract_maxsr  summedouts          rb   r  Llama.logits_to_logprobs  s    
 #%''&d"Ka56K[112[))KFryyIff]#[[)VVCT:F&&.C * "" *) s""s   +,C##
C5c                L    Sn[        X5       H  u  p4X4:X  a  US-  nM    U$    U$ r  )rv  )r~  r  r}  _a_bs        rb   r  Llama.longest_token_prefix  s7    !iFBx!#  
 re   c                *    SSK JnJn	  SSKJn
  U
" U5        U	" 5       nUR                  USS9 Vs/ s H  n[        U[        5      (       a  US   OUPM!     nn/ nU H7  n[        U5      R                  U5      nUR                  [        U5      5        M9     U Vs/ s H"  n[        R                  " X5      (       d  M   UPM$     nn[        U5      S:X  a(  [        SU S	U S
[         R"                  " U5       35      e[        U5      S:  a(  [        SU SU S
[         R"                  " U5       35      eUu  n[        [        U5      R$                  5      n[        U5      R&                  nU" UUUUUUS9  U(       a  U H  nU Vs/ s H#  n[        R                  " UU5      (       d  M!  UPM%     nn[        U5      S:X  a(  [        SU S	U S
[         R"                  " U5       35      e[        U5      S:  a(  [        SU SU S
[         R"                  " U5       35      eUu  nU" UUUUUUS9  M     Uc  U" UUUUUUSS9nO[(        R*                  R-                  XB5      nU " SSU0UD6$ ! [
         a    [        S5      ef = fs  snf s  snf s  snf )as  Create a Llama model from a pretrained model name or path.
This method requires the huggingface-hub package.
You can install it with `pip install huggingface-hub`.

Args:
    repo_id: The model repo id.
    filename: A filename or glob pattern to match the model file in the repo.
    additional_files: A list of filenames or glob patterns to match additional model files in the repo.
    local_dir: The local directory to save the model to.
    local_dir_use_symlinks: Whether to use symlinks when downloading the model.
    **kwargs: Additional keyword arguments to pass to the Llama constructor.

Returns:
    A Llama model.r   )hf_hub_downloadHfFileSystem)validate_repo_idzrLlama.from_pretrained requires the huggingface-hub package. You can install it with `pip install huggingface-hub`.T)	recursiverp   zNo file found in z that match z

Available Files:
r   zMultiple files found in z
 matching )repo_idfilename	subfolder	local_dirlocal_dir_use_symlinks	cache_dir)r  r  r  r  r  r  local_files_onlyr   rA  )huggingface_hubr  r  huggingface_hub.utilsr  rN  lsr|   r   r   relative_tory  r   fnmatchr   r   jsondumpsparentrp   r   r   r   )clsr  r  additional_filesr  r  r  r   r  r  r  hffsrg   files	file_listrel_pathmatching_filesmatching_filer  additonal_file_namematching_additional_filesmatching_additional_filer   s                          rb   from_pretrainedLlama.from_pretrained  s   2	E> 	!~ 48
8 'tT22DL<8 	 
  "	DDz--g6HS]+ 
 ,5X948W$9X~!##G9L
 C%%)ZZ	%:$;= 
 ~"*7):hZ H%%)ZZ%6$79 
 *]+223	&++ 	#9	
 '7#>G,vid7??[_atKuTi),v01Q6$+G9LAT@U V--1ZZ	-B,CE 
 01A5$27):FYEZ [--1ZZ->,?A 
 /H+)  #5''+A'' (88 (!##'=#!%J i:J  
!

 	
}  	I 	
 Y@ -ws(   I- &JJ?J J+J-J))r   r   r   r   r   r   r   r]   r   r   r   r   r   r   ry   r   r   r   rF   rG   r   rH   r   r*   rA   rB   rD   rC   r   r   r   r-   r/   r0   rX   rE   r   rL   r&   r   rM   )Xr   r   r#   r   r$   r   r%   r   r&   zOptional[List[float]]r'   r}   r(   r}   r)   r}   r*   z1Optional[Dict[str, Union[bool, int, float, str]]]r+   r   r,   r   r-   r   r.   r   r/   Optional[int]r0   r  r1   r  r2   r   r3   r   r4   r   r5   r   r6   r   r7   r   r8   r   r9   r   r:   r}   r;   r}   r<   r}   r=   r}   r>   Optional[bool]r?   r  r@   r}   rA   r   rB   Optional[str]rC   r   rD   r  rE   zUnion[bool, int]rF   r  rG   z6Optional[llama_chat_format.LlamaChatCompletionHandler]rH   zOptional[LlamaDraftModel]rI   zOptional[BaseLlamaTokenizer]rJ   r  rK   r  rL   r}   rM   r}   )r  zllama_cpp.llama_context_p)r  zllama_cpp.llama_model_p)r  r  )r  r  )r  z
Deque[int])r  zDeque[List[float]])TF)r
  r`  r  r}   r  r}   r  r  )NF)r  r  r  zOptional[List[int]]r  r}   r  r`  )r   zOptional[BaseLlamaCache])r+   r   )r  Sequence[int])r,  r-  r.  r!   r/  r!   r    r    r!   r   r0  r1  TNN)rY  r   rZ  r   r[  r   r\  r   r]  r   r^  r   r_  r   r`  r   ra  r   rb  r   rc  r   rd  r   re  r}   rC  Optional[LogitsProcessorList]rf  Optional[LlamaGrammar])r,  r-  r.  r!   r/  r!   r    r    r!   r   r0  r1  TNNN) rY  r   rZ  r   r[  r   r\  r   r]  r   r^  r   r_  r   r`  r   ra  r   rb  r   rc  r   rd  r   re  r}   rC  r  rf  r  rp  r  )r,  r-  r.  r!   r/  r!   Tr    r    r!   r   r1  r0  TNNN)&r  r  rY  r   rZ  r   r[  r   r\  r   r]  r   r^  r   r  r}   r_  r   r`  r   ra  r   rb  r   rd  r   rc  r   re  r}   rC  r  r|  Optional[StoppingCriteriaList]rf  r  r  z-Generator[int, Optional[Sequence[int]], None]r\   )r  Union[str, List[str]]rW   r  r  CreateEmbeddingResponse)FTF)r  r  r  r}   r  r}   r  r}   )4r  Union[str, List[int]]r  r  r  r  r  r   rZ  r   r[  r   r\  r   r  r  r  r}   r  Optional[Union[str, List[str]]]r_  r   r`  r   r^  r   rY  r   r  r}   r+   r  ra  r   rb  r   rd  r   rc  r   rW   r  r|  r  rC  r  rf  r  r  Optional[Dict[int, float]]r  zSUnion[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]])4r  r  r  r  r  r  r  r   rZ  r   r[  r   r\  r   r  r  r  r}   r  r  r_  r   r`  r   r^  r   rY  r   r  r}   r+   r  ra  r   rb  r   rd  r   rc  r   rW   r  r|  r  rC  r  rf  r  r  r  r  IUnion[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]])4r  r   r  r  r  r  r  r   rZ  r   r[  r   r\  r   r  r  r  r}   r  r  r_  r   r`  r   r^  r   rY  r   r  r}   r+   r  ra  r   rb  r   rd  r   rc  r   rW   r  r|  r  rC  r  rf  r  r  r  r  r  ):r;  z"List[ChatCompletionRequestMessage]r<  z&Optional[List[ChatCompletionFunction]]r=  z+Optional[ChatCompletionRequestFunctionCall]r>  z"Optional[List[ChatCompletionTool]]r?  z(Optional[ChatCompletionToolChoiceOption]r  r   rZ  r   rY  r   r[  r   r\  r   r  r}   r  r  r+   r  r@  z-Optional[ChatCompletionRequestResponseFormat]r  r  r`  r   r_  r   r^  r   ra  r   rb  r   rd  r   rc  r   rW   r  rC  r  rf  r  r  r  r  r  r  r  r  zQUnion[CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]])rO  r   r   r   )r  r_  )rV  r_  r  None)r  r   )r  r   )r  r   )r  r  )rP   )r)  z#Union[npt.NDArray[np.single], List]r  r   r  r  )r~  r  r  r  )NNautoN)r  r   r  r  r  zOptional[List]r  &Optional[Union[str, os.PathLike[str]]]r  zUnion[bool, Literal['auto']]r  r  r   r   r  z'Llama')7__name__
__module____qualname____firstlineno____doc__rz   r^   LLAMA_SPLIT_MODE_LAYERr   r   LLAMA_POOLING_TYPE_UNSPECIFIEDr   propertyr   rW   r   r   r  r  r	  r  r  r  r  r*  rk  ro  r  r  r  r+  r4  r7  rD  rP  rS  rW  r  r  r,   r  rh   rI   r   r   r   r2   r}  r  staticmethodr  r  classmethodr  __static_attributes__rA  re   rb   r   r   7   s   :! #::.2 JN00#')- 99%DD #!$!%"% $ #   %)#'"$#'#'!&%)OS1526 $ $ ugg
 g g g ,g g g g Hg g g  !g" #g$ !%g& ''g(
)g. /g0 1g2 3g4 5g6  7g8 9g: ;g< =g> ?g@ AgB CgD EgF #GgH !IgL MgN  OgR !SgT UgV !WgZ [g^ #_g` Magd /egh 0igl mgn ogr sgt ugR   ! ! / / / / S S 
 
 BG@@$(@:>@	@* ,0	

 )
 	

 

(#&N  ##&"%!! :>*.!WW W 	W
 W W W !W  W W W W W W 8W  (!Wv  ##&"%!! :>*.!#<< < 	<
 < < < !<  < < < < < < 8<  (!<" #<B  ##&"%!! :><@*.'JJ J 	J
 J J J J J !J  J J J J J  !J" 8#J$ :%J& ('J( 
7)JZ DH&
*&
3@&
	 &
V  "w$w w 	w
 wx !%$& "&02#&"% #"!!#<@:>*.155j	
%j	
 j	
 "	j	

 j	
 j	
 j	
 j	
  j	
 j	
 .j	
 !j	
  j	
 j	
 j	
  !j	
" #j	
$ %j	
& 'j	
( )j	
* +j	
, -j	
. :/j	
0 81j	
2 (3j	
4 /5j	
6
7j	
^ !%$& "&02#&"% #"!!#<@:>*.155_%_ _ "	_
 _ _ _ _  _ _ ._ !_  _ _ _  !_" #_$ %_& '_( )_* +_, -_. :/_0 81_2 (3_4 /5_6 
S7_H !%$& "&02#&"% #"!!#<@:>*.155Z
Z
 Z
 "	Z

 Z
 Z
 Z
 Z
  Z
 Z
 .Z
 !Z
  Z
 Z
 Z
  !Z
" #Z
$ %Z
& 'Z
( )Z
* +Z
, -Z
. :/Z
0 81Z
2 (3Z
4 /5Z
6 
S7Z
~ =AEI48@D 02"IM$("%#& #!!#:>*.15#'&*;e
4e
 :e
 C	e

 2e
 >e
 e
 e
 e
 e
 e
 e
 .e
 e
 Ge
  "!e
"  #e
$ !%e
& 'e
( )e
* +e
, -e
. /e
0 1e
2 83e
4 (5e
6 /7e
8 !9e
: $;e
<
=e
N B6
p
<A!$%$''&( AC#3#;>#	# #"   
 ,0<@?E<@|
|
  |
 )	|

 :|
 !=|
 :|
 |
 
|
 |
re   r   c                  6    \ rS rSr            SS jrSrg)r_  i?	  c                L    Xl         X l        X0l        X@l        XPl        X`l        g r\   )r   r   rX   rZ  r[  r+   )ra   r   r   rX   rZ  r[  r+   s          rb   r   LlamaState.__init__@	  s$     # & 0	re   )r   rZ  r[  rX   r   r+   N)r   r  r   r  rX   r   rZ  r`  r[  r   r+   r   )r  r  r  r  r   r  rA  re   rb   r_  r_  ?	  sA    ' ' 	
   re   r_  c                  *    \ rS rSr      SS jrSrg)r  iV	  c                (    U  H  nU" X5      nM     U$ r\   rA  )ra   r   r   	processors       rb   r7  LogitsProcessorList.__call__W	  s     Iy1F re   rA  Nr  r  r  r  r  r7  r  rA  re   rb   r  r  V	  s    -7M	re   r  c                  *    \ rS rSr      SS jrSrg)StoppingCriteriaListib	  c           	     N    [        U  Vs/ s H
  o3" X5      PM     sn5      $ s  snf r\   )any)ra   r   r)  r|  s       rb   r7  StoppingCriteriaList.__call__c	  s+     RVWRV=N%i8RVWXXWs   "rA  N)r   r  r)  r  r  r}   r  rA  re   rb   r  r  b	  s#    Y-Y7MY	Yre   r  c                  4    \ rS rSrSS jr      SS jrSrg)MinTokensLogitsProcessorii	  c                *    Xl         X l        S U l        g r\   )
min_tokensr   r  )ra   r  r   s      rb   r   !MinTokensLogitsProcessor.__init__j	  s    $"!re   c                    U R                   c  [        U5      U l         [        U5      U R                   -
  U R                  :  a  [        R                  * X R
                  '   U$ r\   )r  r   r  r   infr   )ra   r   r   s      rb   r7  !MinTokensLogitsProcessor.__call__o	  sN     %!$YDy>D...@&(ffWF>>"re   )r  r  r   N)r  r   r   r   r  )r  r  r  r  r   r7  r  rA  re   rb   r  r  i	  s$    "
-7M	re   r  )D
__future__r   r   r   r  r  r  r   r   r  r  r  rw   r   r   r   r   r   r   r	   r
   r   r   r   r   collectionsr   pathlibr   llama_typesllama_grammarr   llama_cacher   r   r   r   llama_tokenizerr   r   llama_cpp.llama_cppr^   llama_cpp.llama_chat_formatr   llama_cpp.llama_speculativer   numpyr   numpy.typingnptllama_cpp._internals
_internalsr   _loggerr   _utilsr   r   r_  NDArrayr   r   LogitsProcessorr  r}   StoppingCriteriar  r  rA  re   rb   <module>r     s2   " 	 
                 '  @ ' 7 7   (   *E$
 E$
PH $ [[3;;ryy12CKK		4JJ
$/  S[[13;;ryy3IJDPQ Y4 01 Y re   