import pdfplumber
import json
from ctransformers import AutoModelForCausalLM


class Extractor:
    def __init__(self, model_name="Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"):
        self.text = ""
        self.results = {"objects": []}
        self.model_name = model_name
        self.extracted_names = set()
        model_path = "models/Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
        self.llm = AutoModelForCausalLM.from_pretrained(
            model_path_or_repo_id=model_path,
            model_type="mistral",
            gpu_layers=0,
            context_length=1536,
        )
        print("Model loaded!")
    def _extract_text(self, pdf_path):
        print(f"Extracting text from {pdf_path}...")
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    t = page.extract_text()
                    if t:
                        self.text += t + "\n"
            return self.text
        except FileNotFoundError:
            print("Error: PDF file not found.")
            return ""
    def _build_prompt(self, num_objects, text_chunk):
        avoid_list = ", ".join(list(self.extracted_names)[-20:])
        return f"""
Extract {num_objects} UNIQUE SINGULAR historical objects from the text that can be picked up and held.
FOCUS ON: Tools, weapons, armor, shields, jewelry, household items, ceremonial objects.
STRICTLY AVOID:
- Coins, tablets, manuscripts, or anything with inscriptions/text
- Abstract items (concepts, ideas, events)
- Architectural elements (floors, pavements, columns, walls, buildings)
- Large immovable objects (statues over human size, altars, monuments)
- Natural objects (rocks, plants, terrain features)
Each object MUST be a singular item that a person could carry.
ALREADY EXTRACTED (DO NOT REPEAT): {avoid_list}
Return VALID JSON only:
{{
"objects": [
{{
"name": "Object Name",
"context": "Place here a brief historical context of the object, usage and any interesting facts, do not hallucinate."
}}
]
}}
Text:
{text_chunk}
"""
    def _query_llm(self, prompt):
        try:
            print("check a")
            response = self.llm(
                prompt,
                max_new_tokens=256,
                temperature=0.2,
                top_p=0.9,
            )
            print("check 1")
            response_text = response.strip()
            print("check 2")
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1
            print("check 3")
            if start_idx != -1 and end_idx > start_idx:
                json_str = response_text[start_idx:end_idx]
                return json.loads(json_str)
            else:
                print(f"No JSON found in response: {response_text[:200]}")
                return {"objects": []}
        except Exception as e:
            print(f"LLM Error: {e}")
            # raise RuntimeError("LLM crashed.")
            return {"objects": []}
    def _extract_objects(self, num_objects=10):
        if not self.text:
            return
        chunk_size = 512
        chunks = [self.text[i:i + chunk_size] for i in range(0, len(self.text), chunk_size)]
        objs_per_chunk = max(1, round(num_objects / len(chunks))) if chunks else 1
        print(f"Processing {len(chunks)} chunks ({chunk_size} chars each)...")
        for i, chunk in enumerate(chunks):
            if len(self.results["objects"]) >= num_objects:
                break
            print(f"Chunk {i+1}/{len(chunks)}: Extracting ~{objs_per_chunk} objects...")
            prompt = self._build_prompt(objs_per_chunk, chunk)
            batch_results = self._query_llm(prompt)
            if "objects" in batch_results:
                for obj in batch_results["objects"]:
                    if obj["name"].lower() not in self.extracted_names:
                        self.results["objects"].append(obj)
                        self.extracted_names.add(obj["name"].lower())
                        print(f"Extracted: {obj['name']}")
        return self.results
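
    # Dump everything extracted so far to a JSON file.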
    def _save_results(self, path="extraction_results.json"):
        with open(path, "w", encoding="utf8") as f:
            json.dump(self.results, f, indent=2)
        print(f"Successfully saved {len(self.results['objects'])} objects to {path}")
    def extract(self, num_objects=10, pdf_path=""):
        text = self._extract_text(pdf_path)
        if text:
            self._extract_objects(num_objects)
            self._save_results()
        return self.results


if __name__ == "__main__":
    extractor = Extractor()
    extractor.extract(num_objects=10, pdf_path="data/history-roman-empire.pdf")
I can't lie, I'm pretty sure some of this mess is slop-coded. When I run the Python file, I get the following output:
Model loaded!
Extracting text from data/history-roman-empire.pdf...
Processing 17 chunks (512 chars each)...
Chunk 1/17: Extracting ~1 objects...
check a
LLM Error: exception: access violation reading 0x000001FA4455E098
Chunk 2/17: Extracting ~1 objects...
check a

I'm aware the second chunk is trying to use a corrupted LLM state, and so it crashes out. But I'm not sure what's causing the LLM Error in the first place, or how to fix it (GPT can't do it, redeem yourselves, Stack Overflowers).
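For anyone who wants to poke at this without the PDF, here's an untested stripped-down sketch that exercises the same call pattern: same GGUF file and load settings as the class above, just with a placeholder prompt instead of the real one. I've also thrown in a token count check, purely on the guess that the real prompt plus the 256 new tokens might be brushing up against the 1536-token context window:

# Stripped-down sketch, untested -- same model file and settings as the Extractor class.
# The placeholder prompt and the overflow check are guesses on my part, not a confirmed fix.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="models/Mistral-7B-Instruct-v0.3-Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=0,
    context_length=1536,
)

prompt = "Extract one historical object from this text: a legionary's gladius."  # placeholder

# If I'm reading the ctransformers API right, tokenize() returns the token ids,
# so this shows how close the prompt already is to the 1536-token context window
# before the 256 new tokens get generated on top of it.
print("prompt tokens:", len(llm.tokenize(prompt)))

# Call the model repeatedly, the same way the chunk loop does.
for i in range(3):
    out = llm(prompt, max_new_tokens=256, temperature=0.2, top_p=0.9)
    print(f"call {i}: got {len(out)} chars back")

If this also falls over on the second call on someone else's machine, then at least the pdfplumber and prompt-building code is off the hook.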
I'd also understand if you have no idea what the hell is happening; I'm being forced to use LLMs in this project for uni and really CBA with this module. Let's hope the professors don't see this post. If you're my professor, I was hacked.
