本文介绍一个简历信息提取系统:使用Pydantic模型定义数据结构,并通过大语言模型(如GPT)从非结构化的简历文本中提取结构化信息。以下是详细解析:
核心功能
数据建模:用Pydantic的Resume类严格定义简历字段和校验规则。
格式转换:自动统一日期格式(如1990-05-15 → 05-15-1990)。
大模型交互:通过Prompt工程让AI提取信息并返回标准JSON。
核心代码
import json
from datetime import datetime, date
from typing import List, Optional

from pydantic import BaseModel, Field, field_validator, EmailStr, model_validator
# Defining this Pydantic model is the crux of the pipeline: its JSON schema
# is what gets embedded in the extraction prompt.
class Resume(BaseModel):
    """Structured résumé fields extracted from free-form résumé text.

    Every field is optional and defaults to ``None``. The ``description``
    strings are emitted into the JSON schema, so they double as per-field
    instructions for the language model.
    """

    name: Optional[str] = Field(None, description="求职者姓名,如果没找到就置为空字符串")
    city: Optional[str] = Field(None, description="求职者居住地,如果没找到就置为空字符串")
    birthday: Optional[str] = Field(None, description="求职者生日,如果没找到就置为空字符串")
    phone: Optional[str] = Field(None, description="求职者手机号,如果没找到就置为空字符串")
    email: Optional[str] = Field(None, description="求职者邮箱,如果没找到就置为空字符串")
    education: Optional[List[str]] = Field(None, description="求职者教育背景")
    experience: Optional[List[str]] = Field(None, description="求职者工作或实习经历,如果没找到就置为空字符串")
    project: Optional[List[str]] = Field(None, description="求职者项目经历,如果没找到就置为空字符串")
    certificates: Optional[List[str]] = Field(None, description="求职者资格证书,如果没找到就置为空字符串")

    @field_validator("birthday", mode="before")
    @classmethod
    def validate_and_convert_date(cls, raw_date):
        """Normalize ``birthday`` to an ``MM-DD-YYYY`` string.

        Args:
            raw_date: ``None``, a date string in one of the accepted formats,
                or a ``date``/``datetime`` object.

        Returns:
            The date formatted as ``MM-DD-YYYY``, or ``None`` when the value
            is ``None``, blank, or the ``n/a`` placeholder.

        Raises:
            ValueError: If a non-blank string matches none of the accepted
                formats, or the value is neither a string nor a date.
        """
        if raw_date is None:
            return None

        if isinstance(raw_date, str):
            text = raw_date.strip()
            # The field description asks for an empty string when the value is
            # missing and the extraction prompt asks for `n/a`; treat both as
            # "not provided" instead of rejecting the whole résumé.
            if text.lower() in ("", "n/a"):
                return None

            # NOTE: order matters for ambiguous inputs. '%d-%m-%Y' is tried
            # before '%m-%d-%Y', so "03-04-1990" is read as 3 April.
            date_formats = ['%d-%m-%Y', '%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y']
            for fmt in date_formats:
                try:
                    parsed_date = datetime.strptime(text, fmt)
                except ValueError:
                    continue  # Try the next format.
                return parsed_date.strftime('%m-%d-%Y')

            raise ValueError(
                f"Invalid date format for 'birthday'. Expected one of: {', '.join(date_formats)}."
            )

        if isinstance(raw_date, date):
            # Also covers datetime, which subclasses date.
            return raw_date.strftime('%m-%d-%Y')

        raise ValueError(
            "Invalid type for 'birthday'. Must be a string or a date object."
        )
class ResumeOpenAI:
    """Extract structured résumé data from raw text through a chat-completion API.

    The prompt embeds the JSON schema of the ``Resume`` model and asks the
    model to answer with JSON that follows it.

    NOTE(review): ``run`` relies on module-level ``client`` and ``chat_model``
    objects that are not defined in the visible part of this file — confirm
    they are set up before ``run`` is called.
    """

    def __init__(self):
        # Build the schema once; it is interpolated into every prompt.
        self.resume_profile = Resume()
        self.output_schema = self.resume_profile.model_json_schema()
        self.template = """
You are an expert in analyzing resumes. Use the following JSON schema to extract relevant information:
```json
{output_schema}
```
Extract the information from the following document and provide a structured JSON response strictly adhering to the schema above.
Please remove any ```json ``` characters from the output. Do not make up any information. If a field cannot be extracted, mark it as `n/a`.
Document:
----------------
{resume_content}
----------------
"""

    def create_prompt(self, output_schema, resume_content):
        """Fill the prompt template with the schema and the résumé text.

        Args:
            output_schema: The JSON schema, either as an already-serialized
                string or as a JSON-serializable object (e.g. the dict
                returned by ``model_json_schema()``).
            resume_content: Raw résumé text to analyze.

        Returns:
            The complete user prompt.
        """
        if not isinstance(output_schema, str):
            # Serialize to real JSON; formatting the dict directly would embed
            # a Python repr (single quotes, None) in a block labelled as JSON.
            output_schema = json.dumps(output_schema, ensure_ascii=False, indent=2)
        return self.template.format(
            output_schema=output_schema,
            resume_content=resume_content,
        )

    def run(self, resume_content):
        """Send the résumé to the chat model and return its raw reply.

        Args:
            resume_content: Raw résumé text to analyze.

        Returns:
            The message content of the first choice, or ``None`` if the
            request failed for any reason (the error is printed).
        """
        try:
            response = client.chat.completions.create(
                model=chat_model,
                # Not every model supports response_format; check whether the
                # model being called accepts this parameter.
                # Qwen and Zhipu models generally do.
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": "你是一位专业的简历信息提取专家。"},
                    {"role": "user", "content": self.create_prompt(self.output_schema, resume_content)},
                ],
            )
            return response.choices[0].message.content
        except Exception as e:
            # Best-effort boundary: report the failure and return None rather
            # than falling through to an unbound local.
            print(f"Error occurred: {e}")
            return None
# Module-level extractor instance, created at import time; the constructor
# instantiates a default Resume and derives its JSON schema.
resume_openai = ResumeOpenAI()
