import os
import re
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://pointcloud.ucsd.edu/OH3D/"
OLD_PREFIX = "https://pointcloud.ucsd.edu/OH3D/pointclouds"
NEW_PREFIX = "https://pointcloud.ucsd.edu/archive_temp/oh3d_store/oh3d-vis/datasets"
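# Intended URL rewrite, illustrated with a hypothetical DOI folder ("10.xxxx"):
#   https://pointcloud.ucsd.edu/OH3D/pointclouds/10.xxxx/cloud.js
#   -> https://pointcloud.ucsd.edu/archive_temp/oh3d_store/oh3d-vis/datasets/10.xxxx/potree/cloud.js
# (clean_block swaps the prefix, then inserts the "/potree/" segment after the DOI folder)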


def get_html_file_list():
    res = requests.get(BASE_URL, timeout=30)  # timeout so a stalled connection cannot hang the run
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")

    return [
        link.get("href")
        for link in soup.find_all("a")
        if link.get("href", "").endswith(".html")
    ]


def download_html(filename):
    url = BASE_URL + filename
    print("Downloading:", url)
    res = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the run
    res.raise_for_status()
    return res.text


def clean_block(block):
    """
    Perform the following transforms inside a Potree.loadPointCloud block:
    - Replace URL prefix
    - Insert /potree/ after DOI folder
    - Remove lines starting with:
        window.pointcloudMap
        window.pointclouds.push
    - Change e.pointcloud.visible = false → e.pointcloud.visible = true
    """

    # Step 1: Replace the old root URL with the archive URL
    block = block.replace(OLD_PREFIX, NEW_PREFIX)

    # Step 2: Insert /potree/ after the DOI folder
    block = re.sub(
        r"(oh3d-vis/datasets/[^/]+)/",
        r"\1/potree/",
        block
    )
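    # Note: this assumes the rewritten URLs do not already contain a "potree/"
    # segment right after the DOI folder; if one were present, the substitution
    # above would add a duplicate.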

    # Step 3: Flip visibility flags
    block = re.sub(
        r"e\.pointcloud\.visible\s*=\s*false\s*;?",
        "e.pointcloud.visible = true;",
        block
    )

    # Step 4: Remove unwanted lines
    cleaned_lines = []
    for line in block.splitlines():
        stripped = line.lstrip()

        if stripped.startswith("window.pointcloudMap"):
            continue
        if stripped.startswith("window.pointclouds.push"):
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines).rstrip()


def extract_full_potree_calls(text):
    """Balanced-parenthesis extraction of full Potree.loadPointCloud blocks."""

    calls = []

    # Each match ends just past the opening parenthesis, so nesting depth
    # starts at 1 from match.end().
    for match in re.finditer(r"Potree\.loadPointCloud\s*\(", text):
        start = match.start()
        paren_count = 1
        end = match.end()

        while end < len(text) and paren_count > 0:
            if text[end] == '(':
                paren_count += 1
            elif text[end] == ')':
                paren_count -= 1
            end += 1

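        # Extend past the closing parenthesis so a same-line trailing semicolon
        # is included and the captured block ends as a complete statement.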
        while end < len(text) and text[end] not in [';', '\n']:
            end += 1
        if end < len(text) and text[end] == ';':
            end += 1

        raw_block = text[start:end].strip()

        # Clean the block (URL conversion + removing unwanted lines)
        cleaned_block = clean_block(raw_block)

        calls.append(cleaned_block)

    return calls


def ensure_directory(doi):
    path = os.path.join(doi, "potree")
    os.makedirs(path, exist_ok=True)
    return os.path.join(path, "config.js")


def write_config(doi, calls):
    out_path = ensure_directory(doi)
    print(f"Writing {len(calls)} calls → {out_path}")

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"// Auto-generated Potree config for DOI {doi}\n\n")
        for call in calls:
            f.write(call + "\n\n")


def extract_doi_from_filename(filename):
    # Strip only a trailing ".html"; str.replace would also drop the substring
    # if it appeared elsewhere in the name.
    return filename[:-len(".html")] if filename.endswith(".html") else filename


def main():
    html_files = get_html_file_list()

    for filename in html_files:
        doi = extract_doi_from_filename(filename)
        html = download_html(filename)

        potree_calls = extract_full_potree_calls(html)

        if not potree_calls:
            print(f"No calls in {filename}")
            continue

        write_config(doi, potree_calls)

    print("Done!")


if __name__ == "__main__":
    main()
