# Rag_basic/data/preprocessing.py

import re, html
import pandas as pd
from typing import Dict, Any

# Matches <br>, <br/>, < br /> in any letter case; each becomes a line break.
BR_RE = re.compile(r'<\s*br\s*/?>', flags=re.I)
# Closing tags of block-level elements. They end a visual line just like <br>,
# so they must become a newline instead of being silently dropped.
_BLOCK_END_RE = re.compile(
    r'<\s*/\s*(?:p|div|li|tr|h[1-6]|ul|ol|table|dt|dd)\s*>', flags=re.I
)
# Any remaining tag (opening, closing or self-closing).
TAG_RE = re.compile(r'<[^>]+>')
# Leading list marker ("-", "•" or "–") together with surrounding whitespace.
_BULLET_RE = re.compile(r'^\s*[-•–]\s*')


def clean_html(text: str) -> str:
    """Convert an HTML fragment into plain text with one entry per line.

    Steps, in order: ``<br>`` and block-level closing tags become newlines,
    every other tag is removed, HTML entities are unescaped, each line is
    stripped of surrounding whitespace and of a leading bullet marker, and
    empty lines are dropped.

    Args:
        text: Raw HTML (or plain) text. Any non-string value (``None``,
            ``NaN``, numbers) is treated as missing.

    Returns:
        The cleaned lines joined with ``"\\n"``; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = BR_RE.sub("\n", text)
    # Without this, "<li>RAM: 4GB</li><li>ROM: 64GB</li>" would collapse into
    # the single line "RAM: 4GBROM: 64GB".
    text = _BLOCK_END_RE.sub("\n", text)
    text = TAG_RE.sub("", text)
    # Unescape only after tag removal so escaped markup such as "&lt;b&gt;"
    # survives as literal text.
    text = html.unescape(text)
    lines = (_BULLET_RE.sub("", ln.strip()) for ln in text.splitlines())
    return "\n".join(ln for ln in lines if ln)

# --------- PARSER CHO SPECS ----------
# Lower-cased Vietnamese spec labels -> canonical field names.
_SPEC_ALIASES = {
    "công nghệ màn hình": "screen_tech",
    "độ phân giải": "resolution",
    "kích thước màn hình": "screen_size",
    "hệ điều hành": "os",
    "vi xử lý": "chipset",
    "bộ nhớ trong": "storage",
    "ram": "ram",
    "mạng di động": "network",
    "số khe sim": "sim",
    "dung lượng pin": "battery",
}


def parse_product_specs(raw: str) -> Dict[str, Any]:
    """Extract normalized phone specs from a raw (possibly HTML) spec string.

    The text is cleaned with ``clean_html`` and then parsed in two passes:

    1. Lines of the form ``"Label: value"`` whose lower-cased label is a known
       Vietnamese spec label are copied verbatim under the canonical name
       (if a label repeats, the last occurrence wins).
    2. For every field still missing, a regex fallback scans the lower-cased
       text. Fallback values for ``chipset`` and ``os`` are therefore
       lower-case.

    Args:
        raw: Raw spec text. Non-string input is treated as empty text.

    Returns:
        A dict containing any of ``screen_tech``, ``resolution``,
        ``screen_size``, ``os``, ``chipset``, ``storage``, ``ram``,
        ``network``, ``sim``, ``battery`` that could be determined, plus
        ``has_5g`` (always present, ``bool``).
    """
    text = clean_html(raw)
    norm: Dict[str, Any] = {}

    # Pass 1: explicit "Label: value" lines with a known label.
    for ln in text.splitlines():
        label, sep, value = ln.partition(":")
        if not sep:
            continue
        field = _SPEC_ALIASES.get(label.strip().lower())
        if field is not None:
            norm[field] = value.strip()

    # Pass 2: regex fallbacks. In the patterns below, "[^\S\n]" is "whitespace
    # except newline" and "[^0-9\n]" keeps a label and its value on the same
    # line, so a number from the following line is never picked up.
    low = text.lower()

    if "screen_size" not in norm:
        # Accept both "6.5" and the Vietnamese decimal comma "6,5".
        m = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:inch|")', low)
        if m:
            norm["screen_size"] = m.group(1).replace(",", ".") + " inch"

    if "resolution" not in norm:
        m = re.search(r'(\d{3,4})\s*[x×]\s*(\d{3,4})', low)
        if m:
            norm["resolution"] = f"{m.group(1)}×{m.group(2)}"

    if "ram" not in norm:
        # The letter guards stop "ram" from matching inside e.g. "ceramic".
        m = re.search(r'(?<![a-z])ram(?![a-z])[^0-9\n]*(\d+)\s*gb', low)
        if m:
            norm["ram"] = f"{m.group(1)}GB"
        else:
            # "a + b GB RAM": keep the larger of the two numbers.
            m = re.search(r'(\d+)\s*\+\s*(\d+)\s*gb\s*ram(?![a-z])', low)
            if m:
                norm["ram"] = f"{max(int(m.group(1)), int(m.group(2)))}GB"

    if "storage" not in norm:
        # The letter guards stop "rom" from matching inside e.g. "chrome".
        m = re.search(
            r'(?:bộ nhớ trong|(?<![a-z])rom(?![a-z]))[^0-9\n]*'
            r'(?:(\d{2,4})\s*gb|(\d{1,2})\s*tb)',
            low,
        )
        if m:
            if m.group(1):
                norm["storage"] = f"{m.group(1)}GB"
            else:
                norm["storage"] = f"{m.group(2)}TB"

    if "battery" not in norm:
        m = re.search(r'(\d{3,5})\s*mah', low)
        if m:
            norm["battery"] = f"{m.group(1)} mAh"

    if "chipset" not in norm:
        # Vendor keyword plus the remainder of its line.
        m = re.search(r'(unisoc|snapdragon|helio|dimensity|exynos|kirin)[^\n]*', low)
        if m:
            norm["chipset"] = m.group(0).strip()

    if "os" not in norm:
        m = re.search(r'android[^\S\n]*\d+[^\S\n]*[a-z0-9 .-]*', low)
        if m:
            norm["os"] = m.group(0).strip()

    if "sim" not in norm:
        # The lookbehind prevents matching the tail of a larger number such
        # as the "0" in "5.0".
        m = re.search(r'(?<![\d.,])(\d+)[^\S\n]*(?:nano[^\S\n]*)?sim(?![a-z])', low)
        if m:
            norm["sim"] = f"{m.group(1)} SIM"

    # 5G is reported unless the text explicitly says it is not supported
    # ("không hỗ trợ 5g").
    mentions_5g = re.search(r'\b5g\b', low) is not None
    denies_5g = re.search(r'không\s+hỗ\s+trợ\s+5g', low) is not None
    norm["has_5g"] = mentions_5g and not denies_5g

    return norm

# ---- Làm sạch promotion & build text cho embedding ----
def clean_promotion(raw: str) -> str:
    """Return the promotion text cleaned exactly as ``clean_html`` cleans it.

    Args:
        raw: Raw promotion text, forwarded unchanged to ``clean_html``.

    Returns:
        Whatever ``clean_html`` returns for ``raw``.
    """
    cleaned = clean_html(raw)
    return cleaned

def _as_text(value: Any) -> str:
    """Return *value* as stripped text, mapping missing values to ``""``."""
    if isinstance(value, str):
        return value.strip()
    # pd.isna returns an array for list-likes, so only test scalars with it.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def build_embedding_text(row: pd.Series) -> str:
    """Join a row's title, cleaned specs and cleaned promotion for embedding.

    Missing values (``None``/``NaN``, e.g. empty spreadsheet cells) are
    skipped instead of being rendered as the literal text "nan" or "None".

    Args:
        row: A row providing any of ``title``, ``product_specs_clean`` and
            ``product_promotion_clean``; absent keys are treated as empty.

    Returns:
        The non-empty parts joined with ``" | "``, or ``""`` if all are empty.
    """
    fields = ("title", "product_specs_clean", "product_promotion_clean")
    parts = (_as_text(row.get(name)) for name in fields)
    return " | ".join(part for part in parts if part)

# ======= TÍCH HỢP VÀO DataProcessor =======
def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* enriched with cleaned text and parsed spec columns.

    Added columns: ``product_specs_clean``, ``product_promotion_clean``,
    ``screen_size``, ``resolution``, ``chipset``, ``ram``, ``storage``,
    ``battery``, ``sim``, ``os``, ``has_5g`` and ``embedding_text``.

    Args:
        df: Input frame. Must contain ``product_specs``; ``product_promotion``
            is optional and treated as empty when the column is absent.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: If ``df`` has no ``product_specs`` column.
    """
    df = df.copy()

    # Clean the free-text columns.
    df["product_specs_clean"] = df["product_specs"].map(clean_html)
    if "product_promotion" in df.columns:
        df["product_promotion_clean"] = df["product_promotion"].map(clean_promotion)
    else:
        # df.get(..., "") would hand back a plain str, which has no .map().
        df["product_promotion_clean"] = ""

    # Parse structured specs from the raw text: parse_product_specs cleans its
    # input itself, so feeding it the cleaned column would strip tags and
    # unescape entities twice.
    parsed = df["product_specs"].map(parse_product_specs)
    spec_fields = (
        "screen_size",
        "resolution",
        "chipset",
        "ram",
        "storage",
        "battery",
        "sim",
        "os",
        "has_5g",
    )
    for field in spec_fields:
        # Bind the field name as a default so each lambda keeps its own key.
        df[field] = parsed.map(lambda spec, key=field: spec.get(key))

    # Text that gets embedded (title + specs + promotion).
    df["embedding_text"] = df.apply(build_embedding_text, axis=1)
    return df
if __name__ == "__main__":
    # Example usage: read "data.xlsx", process it, print the first three
    # embedding texts, then write the result to "data_processed.xlsx".
    source_frame = pd.read_excel("data.xlsx")
    result_frame = process_dataframe(source_frame)
    preview = result_frame['embedding_text'].head(3)
    print(preview)
    result_frame.to_excel("data_processed.xlsx", index=False)