mirror of
https://git.mia.jetzt/scrubber
synced 2025-01-10 11:11:54 -07:00
156 lines
4.6 KiB
Python
156 lines
4.6 KiB
Python
import json
|
|
import time
|
|
from http.client import HTTPResponse
|
|
from pathlib import Path
|
|
from shutil import copyfileobj
|
|
from urllib.request import urlopen
|
|
|
|
import brotli
|
|
import msgpack
|
|
import psycopg
|
|
|
|
from com import Visibility, eval_config, parse_graph, progressbar
|
|
|
|
# Evaluate the scrubber configuration and open the database connection
# through the "connect" factory it provides.
config = eval_config()
conn: psycopg.Connection = config["connect"]()

# Parsed note graph; looked up by note id when notes are collected.
graph = parse_graph()

print("reading filterlist")
# Only the first space-separated token of each line in filtered.list is kept.
filterlist_text = Path("filtered.list").read_text()
filtered = [entry.split(' ')[0] for entry in filterlist_text.strip().splitlines()]
|
|
|
|
# Cache of exported user records, keyed by user id.
collected_users = {}


def collect_user(id: str):
    """Fetch one user plus profile and store the record in ``collected_users``.

    Nothing happens when the id is already cached or when the ``user``
    table has no row for it. A missing ``user_profile`` row is tolerated:
    the description then defaults to ``""`` and the fields to ``[]``.

    Always returns ``None``; the record is only reachable through
    ``collected_users``.
    """
    if id in collected_users:
        return

    # Short pause before each round of queries.
    time.sleep(0.001)
    user_row = conn.execute(
        'select username, host, "avatarUrl" from "user" where id = %s', [id]
    ).fetchone()
    if user_row is None:
        return None
    username, host, avatar_url = user_row

    profile_row = conn.execute(
        'select description, fields from user_profile where "userId" = %s', [id]
    ).fetchone()
    description, fields = profile_row if profile_row else ("", [])

    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }
|
|
|
|
# Exported notes, stored as (note id, record) pairs.
collected_notes = []
# (file id, url) pairs of attachments that still have to be downloaded.
files_to_collect = []


def collect_note(id: str):
    """Build the export record for one note, recursing into replies and quotes.

    Side effects: the author is collected through ``collect_user``, and for
    notes whose graph node carries the ``"self"`` flag every attachment is
    queued in ``files_to_collect`` for download (its ``url`` is then exported
    as ``None``).

    Args:
        id: Id of the note; must be a key of ``graph``.

    Returns:
        A dict with the note's fields, or ``None`` when the ``note`` table has
        no row for ``id``. The ``attachments``, ``replies`` and ``quotes``
        keys are only present when they are non-empty.
    """
    # Short pause before each query.
    time.sleep(0.001)
    note = conn.execute(
        'select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds", cw from note where id = %s',
        [id],
    ).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids, cw = note
    collect_user(user_id)

    output = {"id": id}
    output["text"] = text
    output["user_id"] = user_id
    # astimezone(tz=None) converts to the system's local timezone.
    output["created_at"] = created_at.astimezone(tz=None).isoformat()
    output["updated_at"] = None
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()
    output["cw"] = cw

    node = graph[id]
    # Children that have no database row come back as None and are dropped.
    replies = [collect_note(reply) for reply in node["replies"]]
    replies = [reply for reply in replies if reply is not None]
    quotes = [collect_note(quote) for quote in node["quotes"]]
    quotes = [quote for quote in quotes if quote is not None]

    attachments = []
    for file_id in file_ids:
        time.sleep(0.0005)
        row = conn.execute(
            'select name, type, comment, url from drive_file where id = %s',
            [file_id],
        ).fetchone()
        if row is None:
            continue
        # The file name is selected but not exported.
        _name, type_, comment, url = row
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if "self" in node["flags"]:  # archive own attachments
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        # Previously the attachment was built but never stored, so the
        # "attachments" list was always empty and always removed.
        attachments.append(attachment)

    # Empty collections are left out of the record entirely.
    if attachments:
        output["attachments"] = attachments
    if replies:
        output["replies"] = replies
    if quotes:
        output["quotes"] = quotes

    return output
|
|
|
|
# Collect every note listed in the filter list, advancing the progress bar
# once per entry.
pb = progressbar.ProgressBar(0, len(filtered), prefix="collecting data ")
for note_id in filtered:
    # The pair is stored even when collect_note returned None.
    collected_notes.append((note_id, collect_note(note_id)))
    pb.increment()
pb.finish()
|
|
|
|
# Create the output tree: out/ plus one subdirectory per kind of record.
# exist_ok=True replaces the separate exists() checks, which avoids the
# check-then-create race and matches the mkdir(exist_ok=True) calls used for
# the per-record directories elsewhere in this script.
outdir = Path("out")
outdir.mkdir(exist_ok=True)
for subdir in ("note", "user", "file"):
    (outdir / subdir).mkdir(exist_ok=True)
|
|
|
|
# Write every collected note and user as a brotli-compressed msgpack file.
pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)

# Notes are sharded by the first three characters of their id.
for note_id, note in collected_notes:
    outfile = outdir / "note" / note_id[:3] / f"{note_id[3:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(note)))
    pb.increment()

# Users are sharded by the first two characters of their id.
for user_id, user in collected_users.items():
    outfile = outdir / "user" / user_id[:2] / f"{user_id[2:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    with outfile.open("wb") as f:
        # Serialise the user record itself; this used to dump the leftover
        # `note` variable from the loop above into every user file.
        f.write(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()
|
|
|
|
# Download every queued attachment into out/file/, sharded by the first two
# characters of the file id.
pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for file_id, url in files_to_collect:
    outfile = outdir / "file" / file_id[:2] / file_id[2:]
    outfile.parent.mkdir(exist_ok=True)
    # Both handles are context-managed so the HTTP response is closed even
    # when opening the file or copying the body raises.
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()
|