mirror of
https://git.mia.jetzt/scrubber
synced 2025-01-25 12:21:28 -07:00
initial commit
This commit is contained in:
commit
81071e8fee
4 changed files with 304 additions and 0 deletions
159
1_graph.py
Normal file
159
1_graph.py
Normal file
|
@ -0,0 +1,159 @@
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from collections import namedtuple
|
||||||
|
from functools import cache
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psycopg
|
||||||
|
|
||||||
|
try:
|
||||||
|
import progressbar2 as progressbar
|
||||||
|
except ImportError:
|
||||||
|
import progressbar
|
||||||
|
|
||||||
|
|
||||||
|
# Row subset read from the `note` table: the ids this note renotes / replies
# to (either may be null) and the id of its author.
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
# Graph node: a note id plus the child nodes attached to it as replies and
# as renotes.
Tree = namedtuple("Tree", ["id", "replies", "renotes"])

print("configuring")
# config.py is executed as Python and its globals are collected in `config`.
# NOTE(review): this runs arbitrary code, so config.py must be a trusted file.
config = {}
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
user_id: str = config["user_id"]
# Optional; when set, id fetching stops once more ids than this are collected.
early_exit = config.get("early_exit")

print("fetching note ids", file=sys.stderr)
note_ids = set()
# Every note authored by the configured user, except rows that have a
# renote target but no text of their own.
cur = conn.execute(
    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
    [user_id],
)
while True:
    # Pull the result set in batches of 255 rows.
    rows = cur.fetchmany(0xFF)
    if not rows:
        break
    note_ids.update(row[0] for row in rows)
    if early_exit and len(note_ids) > early_exit:
        break
|
||||||
|
|
||||||
|
|
||||||
|
@cache
def get_note(id: str) -> Note:
    """Load the renote target, reply target and author of note `id`.

    Results are memoised by `functools.cache`, so each id is queried at most
    once per run. If no row matches, `fetchone()` yields None and building
    the `Note` fails with a TypeError.
    """
    cursor = conn.execute(
        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
    )
    row = cursor.fetchone()
    return Note(*row)
|
||||||
|
|
||||||
|
|
||||||
|
# Notes that have neither a reply target nor a renote target, keyed by id.
roots: dict[str, "Tree"] = {}
# Every Tree node created so far, keyed by note id.
trees: dict[str, "Tree"] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def tree_init(id: str, seek: bool = True) -> Tree:
    """Return the Tree node for note `id`, creating and linking it if needed.

    A new node is attached to its reply parent's `replies` and/or its renote
    target's `renotes`; those parent nodes are created recursively first.
    A note with neither target is recorded in `roots`. Every created node is
    registered in `trees`.

    `seek` is accepted for callers but is not read by this function.
    """
    existing = trees.get(id)
    if existing:
        return existing

    node = Tree(id, [], [])
    note = get_note(id)

    if not (note.reply_id or note.renote_id):
        # No parent of either kind: this note starts a thread.
        roots[id] = node
    else:
        if note.reply_id:
            parent = tree_init(note.reply_id)
            parent.replies.append(node)
        if note.renote_id:
            target = tree_init(note.renote_id, False)
            target.renotes.append(node)

    # Registered only after the parents have been resolved.
    trees[id] = node
    return node
|
||||||
|
|
||||||
|
|
||||||
|
def make_widgets(msg, trees, roots):
    """Build the widget list for a progress bar labelled `msg`.

    `trees` and `roots` are truthy/falsy switches (they shadow the module
    level dicts of the same name inside this function): each one adds a
    named `Variable` widget that the caller can update through
    `pb.increment(trees=..., roots=...)`.
    """
    layout = [f"{msg} "]
    layout.append(progressbar.Percentage())
    layout.append(" ")
    layout.append(progressbar.Bar())
    layout.append(" ")
    layout.append(progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"))
    layout.append(" ")
    for enabled, variable_name in ((trees, "trees"), (roots, "roots")):
        if enabled:
            layout.extend([progressbar.Variable(variable_name), " "])
    layout.append(progressbar.ETA())
    return layout
|
||||||
|
|
||||||
|
|
||||||
|
# Create (and link) a Tree node for every fetched note id, reporting the
# running totals of known nodes and roots on the progress bar.
pb = progressbar.ProgressBar(
    0, len(note_ids), widgets=make_widgets("building trees", True, True)
)
for note_id in note_ids:
    tree_init(note_id)
    pb.increment(trees=len(trees), roots=len(roots))
pb.finish()
|
||||||
|
|
||||||
|
|
||||||
|
def traverse(tree: Tree):
    """Walk down the reply chain until a note by the configured user is found.

    The first such note on each branch is handed to `expand`; notes by other
    authors are only descended through via their `replies`.
    """
    if get_note(tree.id).user_id == user_id:
        expand(tree)
        return
    for reply in tree.replies:
        traverse(reply)
|
||||||
|
|
||||||
|
|
||||||
|
def expand(tree: Tree):
    """Attach not-yet-known replies and renotes of `tree`, then recurse.

    Candidate ids come from the database function `note_replies`
    (NOTE(review): presumably the direct descendants of the note; the meaning
    of the `1` and `1000` arguments is not visible here - confirm against the
    schema). Ids already present in `trees` are left untouched.
    """
    found = conn.execute(
        "select id from note_replies(%s, 1, 1000)", [tree.id]
    ).fetchall()
    for (child_id, *_) in found:
        if child_id in trees:
            continue
        child_note = get_note(child_id)
        child = Tree(child_id, [], [])
        # A note may be both a reply to and a renote of the same parent, so
        # the two checks are independent.
        if child_note.reply_id == tree.id:
            tree.replies.append(child)
            trees[child_id] = child
        if child_note.renote_id == tree.id:
            tree.renotes.append(child)
            trees[child_id] = child

    # Only the reply branch is followed further down.
    for reply in tree.replies:
        expand(reply)
|
||||||
|
|
||||||
|
|
||||||
|
# Second pass: starting at every root, pull in replies/renotes that were not
# reached while building the trees from the user's own notes.
roots_len = len(roots)
pb = progressbar.ProgressBar(
    0,
    roots_len,
    widgets=make_widgets("expanding roots", True, False),
)

for root in roots.values():
    traverse(root)
    pb.increment(trees=len(trees))
pb.finish()
|
||||||
|
|
||||||
|
|
||||||
|
# Persist the graph as one tab-separated record per note:
#   <id> TAB <reply ids, comma-joined> TAB <renote ids, comma-joined> TAB <flags>
# where flags is a comma-joined subset of "root" (note is in `roots`) and
# "self" (note was written by the configured user).
with Path("graph.db").open("w") as f:
    pb = progressbar.ProgressBar(
        0, len(trees), widgets=make_widgets("saving graph", False, False)
    )
    for tree in trees.values():
        flags = []
        if tree.id in roots:
            flags.append("root")
        if get_note(tree.id).user_id == user_id:
            flags.append("self")
        record = (
            tree.id,
            ",".join(reply.id for reply in tree.replies),
            ",".join(renote.id for renote in tree.renotes),
            ",".join(flags),
        )
        # One write per record instead of seven partial writes.
        f.write("\t".join(record) + "\n")
        pb.increment()
    pb.finish()
|
84
2_filter.py
Normal file
84
2_filter.py
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, List
|
||||||
|
|
||||||
|
import psycopg
|
||||||
|
|
||||||
|
from ty import FilterableNote, Visibility
|
||||||
|
|
||||||
|
try:
|
||||||
|
import progressbar2 as progressbar
|
||||||
|
except ImportError:
|
||||||
|
import progressbar
|
||||||
|
|
||||||
|
|
||||||
|
print("configuring")
# config.py is executed as Python and its globals are collected in `config`.
# NOTE(review): this runs arbitrary code, so config.py must be a trusted file.
config = {}
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
# User-supplied predicate deciding whether a transformed root is selected.
criteria: Callable[[FilterableNote], bool] = config["criteria"]

# Parsed graph.db records keyed by note id.
intermediate = {}

print("parsing")
for line in Path("graph.db").read_text().splitlines():
    # Record layout: id, reply ids, quote ids, flags - tab separated, with
    # the last three being comma-joined lists that may be empty.
    note_id, *columns = line.split("\t")
    replies, quotes, flags = (
        column.split(",") if column else [] for column in columns
    )
    intermediate[note_id] = {
        "id": note_id,
        "replies": replies,
        "quotes": quotes,
        "flags": flags,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def transform(entry: dict) -> "FilterableNote | None":
    """Turn a parsed graph entry and its whole subtree into a FilterableNote.

    Args:
        entry: one value of `intermediate` (keys "id", "replies", "quotes",
            "flags").

    Returns:
        The populated note, or None when this note or any note below it can
        no longer be found in the database (part of the thread disappeared
        during processing).
    """
    note = conn.execute(
        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
        [entry["id"]],
    ).fetchone()
    if note is None:
        return None  # part of thread disappeared during processing
    when, reactions, renotes, visibility = note

    # A single missing descendant invalidates the whole subtree, so stop at
    # the first one instead of querying the remaining siblings.
    replies = _transform_all(entry["replies"])
    if replies is None:
        return None
    quotes = _transform_all(entry["quotes"])
    if quotes is None:
        return None

    return FilterableNote(
        entry["id"],
        "self" in entry["flags"],
        replies,
        quotes,
        when.astimezone(),
        # `reactions` is read as a mapping whose values are summed.
        sum(reactions.values()),
        renotes,
        Visibility.from_db(visibility),
    )


def _transform_all(ids):
    """Transform every listed entry; return None as soon as one is missing."""
    transformed = []
    for child_id in ids:
        child = transform(intermediate[child_id])
        if child is None:
            return None
        transformed.append(child)
    return transformed
|
||||||
|
|
||||||
|
|
||||||
|
# Only entries flagged "root" are evaluated; their subtrees are reached
# through transform().
root_entries = [
    entry for entry in intermediate.values() if "root" in entry["flags"]
]
root_count = len(root_entries)

pb = progressbar.ProgressBar(
    0,
    root_count,
    prefix="processing ",
)
targets = []
for entry in root_entries:
    transformed = transform(entry)
    # None means part of the thread vanished mid-run; it is left for a later
    # run. The root still counts as processed, so the bar advances for it too.
    if transformed is not None and criteria(transformed):
        targets.append(entry["id"])
    pb.increment()
pb.finish()

# One selected root id per line.
Path("filtered.list").write_text("\n".join(targets))
|
0
requirements.txt
Normal file
0
requirements.txt
Normal file
61
ty.py
Normal file
61
ty.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Callable
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class Visibility(Enum):
    """Note visibility levels, with a parser for the database spelling."""

    public = 1
    unlisted = 2
    followers = 3
    direct = 4

    @classmethod
    def from_db(cls, raw: str) -> "Visibility":
        """Map the database string `raw` to a member.

        Recognised values are "public", "home" (-> unlisted), "followers"
        and "specified" (-> direct).

        Raises:
            ValueError: for any other value.
        """
        known = (
            ("public", cls.public),
            ("home", cls.unlisted),
            ("followers", cls.followers),
            ("specified", cls.direct),
        )
        for db_value, member in known:
            if raw == db_value:
                return member
        raise ValueError(f"unknown visibility `{raw}`")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FilterableNote:
    """A note together with its already-transformed replies and quotes."""

    id: str
    # True when the note was flagged as written by the configured user.
    mine: bool
    replies: List["FilterableNote"]
    quotes: List["FilterableNote"]
    when: datetime
    reactions: int
    renotes: int
    visibility: Visibility

    def thread(self) -> List["FilterableNote"]:
        """Return every note of this subtree, children first and self last."""
        collected = []
        for child in (*self.replies, *self.quotes):
            collected.extend(child.thread())
        collected.append(self)
        return collected

    def thread_self(self) -> List["FilterableNote"]:
        """Like `thread`, but keep only the notes whose `mine` flag is set."""
        collected = []
        for child in (*self.replies, *self.quotes):
            collected.extend(child.thread_self())
        if self.mine:
            collected.append(self)
        return collected

    def to_dict(self):
        """Return a nested plain-dict form with `when` as an ISO 8601 string.

        NOTE(review): `visibility` is not part of the result - confirm that
        this omission is intentional.
        """
        nested_replies = [reply.to_dict() for reply in self.replies]
        nested_quotes = [quote.to_dict() for quote in self.quotes]
        return dict(
            id=self.id,
            mine=self.mine,
            replies=nested_replies,
            quotes=nested_quotes,
            when=self.when.isoformat(),
            reactions=self.reactions,
            renotes=self.renotes,
        )
|
Loading…
Reference in a new issue