Description
Hello,
I am using the Llama 3.1 8B model and running inference through ExLlamaV2 with Formatron for structured output. It works well for Literal data types, but for string outputs and lists of strings it does not: the results come back blank.
Below are the two schema classes for which I am unable to get any output.
import json

from formatron.schemas.pydantic import ClassSchema
from pydantic import Field, conlist


class CallSummaryPromptFormat(ClassSchema):
    conversation_summary: str = Field(max_length=250 * 10)


class AreasOfImprovementPromptFormat(ClassSchema):
    suggestion_type: str = Field(max_length=8 * 10)
    explain_improvement_suggestion: str = Field(max_length=80 * 10)


# Use Case - Areas of Improvement
class AgentAreasOfImprovementPromptFormat(ClassSchema):
    agent_areas_of_improvement: conlist(AreasOfImprovementPromptFormat, max_length=5)
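For comparison, a schema built only from Literal fields does produce output on my side. A minimal sketch of that kind of schema (the class name, field name, and literal values here are made up for illustration):

from typing import Literal

from formatron.schemas.pydantic import ClassSchema


# Hypothetical example: with a Literal-typed field like this,
# constrained generation returns a value as expected.
class CallCategoryPromptFormat(ClassSchema):
    call_category: Literal["complaint", "query", "feedback"]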
Below is the code implementation:
from typing import Dict, List, Tuple

from exllamav2 import ExLlamaV2, ExLlamaV2Cache_Q4, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
from formatron.formatter import FormatterBuilder
from formatron.integrations.exllamav2 import create_formatter_filter
from lmformatenforcer.integrations.exllamav2 import build_token_enforcer_tokenizer_data


class MihupExllamaLLM:
    def __init__(self):
        model_dir = "/app/Llama-3.1-8B-Instruct-exl2"
        # model_dir = "/app/mlabonne_NeuralDaredevil-8B-abliterated-5_0bpw_exl2"
        config = ExLlamaV2Config(model_dir)
        # config.fasttensors = True
        self.model = ExLlamaV2(config)
        # ExLlamaV2Cache
        self.cache = ExLlamaV2Cache_Q4(self.model, max_seq_len=256 * 96, lazy=True)  # 32768 - 8200 MB # 24576 - 7900 MB
        self.model.load_autosplit(self.cache, progress=True)
        print("Loading tokenizer...")
        self.tokenizer = ExLlamaV2Tokenizer(config)
        # lm-format-enforcer helper (not used by the Formatron filters below)
        self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
        self.generator = ExLlamaV2DynamicGenerator(
            model=self.model,
            cache=self.cache,
            tokenizer=self.tokenizer,
        )
        self.gen_settings = ExLlamaV2Sampler.Settings(
            temperature=0.0,  # Set to 0 for deterministic output
            top_k=1,  # Only consider the most likely token
            top_p=1.0,  # No nucleus sampling
            token_repetition_penalty=1.0,  # No repetition penalty
        )
        self.generator.warmup()
    def run_mihup_llm_inference(self, call_transcript: str, prompt_tuples: List[Tuple]) -> List[Dict]:
        self.cache.reset()
        common_transcript = format_transcript_text(call_transcript)

        prompts = []
        filters = []
        use_case_ids = []
        for upper_tuple in prompt_tuples:
            use_case_id = upper_tuple[1]
            use_case_ids.append(use_case_id)

            p = upper_tuple[0]
            prompt_str = p[0]
            # print(f"use_case_id : {use_case_id}, prompt : {prompt_str}")
            prompt_question_combined = format_llama3_prompt(mihup_system_prompt, common_transcript + prompt_str)
            prompts.append(prompt_question_combined)

            filter_schema = p[2]
            formatter = FormatterBuilder()
            print("before appending", formatter)
            formatter.append_line(f"{formatter.json(filter_schema, capture_name='json')}")
            print("after appending", formatter)
            filters.append([
                create_formatter_filter(self.model, self.tokenizer, formatter),
            ])

        outputs = self.generator.generate(
            prompt=prompts,
            filters=filters,
            filter_prefer_eos=True,
            max_new_tokens=2048,
            add_bos=True,
            stop_conditions=[self.tokenizer.eos_token_id],
            gen_settings=self.gen_settings,
            completion_only=True,
            encode_special_tokens=True,
        )
        print("Output is", outputs)

        final_output = []
        use_case_ids_to_be_considered = []
        for i in range(len(outputs)):
            try:
                output_json = json.loads(outputs[i])
                final_output.append(output_json)
                use_case_ids_to_be_considered.append(use_case_ids[i])
            except ValueError:
                print("error: {0} , use_case_id :{1}".format(outputs[i], use_case_ids[i]))

        use_case_id_key = "use_case_id"
        for idx in range(len(final_output)):
            final_output[idx][use_case_id_key] = use_case_ids_to_be_considered[idx]

        return final_output
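For completeness, this is roughly how I call the class (a sketch; the transcript text, prompt strings, and use-case IDs are placeholders — each outer tuple is (prompt details, use_case_id), and the inner tuple carries the prompt text at index 0 and the schema class at index 2):

# Usage sketch; all values are placeholders.
llm = MihupExllamaLLM()

prompt_tuples = [
    # inner tuple: (prompt text, <unused here>, schema class); outer tuple: (inner tuple, use_case_id)
    (("Summarise the conversation.", None, CallSummaryPromptFormat), 101),
    (("List areas of improvement for the agent.", None, AgentAreasOfImprovementPromptFormat), 102),
]

results = llm.run_mihup_llm_inference(call_transcript="...", prompt_tuples=prompt_tuples)
print(results)  # expected: list of dicts with a "use_case_id" key added; the str fields come back blank, which is the issue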