mirror of
https://git.mia.jetzt/scrubber
synced 2025-01-10 14:51:53 -07:00
initial commit
This commit is contained in:
commit
81071e8fee
4 changed files with 304 additions and 0 deletions
159
1_graph.py
Normal file
159
1_graph.py
Normal file
|
@ -0,0 +1,159 @@
|
|||
import json
|
||||
import sys
|
||||
from collections import namedtuple
|
||||
from functools import cache
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg
|
||||
|
||||
try:
|
||||
import progressbar2 as progressbar
|
||||
except ImportError:
|
||||
import progressbar
|
||||
|
||||
|
||||
# The three columns get_note() selects for a note: its renote target, its
# reply target and its author.
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
# One node of the in-memory graph: a note id plus the lists of child nodes
# that reply to it / renote it.
Tree = namedtuple("Tree", ["id", "replies", "renotes"])
|
||||
|
||||
print("configuring")
# NOTE(review): config.py is executed as ordinary Python, so it must be a
# trusted, locally-authored file. Its top-level names end up in `config`.
config = {}
exec(Path("config.py").read_text(), config)
# config.py supplies a `connect` callable that opens the database connection,
# the `user_id` whose notes are processed, and optionally `early_exit`.
conn: psycopg.Connection = config["connect"]()
user_id: str = config["user_id"]
early_exit = config.get("early_exit")
|
||||
|
||||
|
||||
print("fetching note ids", file=sys.stderr)
note_ids = set()
# Notes written by the configured user, leaving out rows that have a
# renote target but no text of their own.
cur = conn.execute(
    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
    [user_id],
)
# Read the result in batches of 255 rows.
while rows := cur.fetchmany(0xFF):
    note_ids.update(row[0] for row in rows)
    # Optional cap from config.py. It is checked once per batch so that the
    # `break` leaves the fetch loop itself, not just the current batch.
    if early_exit and len(note_ids) > early_exit:
        break
|
||||
|
||||
|
||||
@cache
def get_note(id: str) -> Note:
    """Return the renote target, reply target and author of note `id`.

    Results are memoised per id for the lifetime of the process, so each
    note is queried at most once.

    Raises:
        LookupError: if no row with this id exists (previously this surfaced
            as an opaque TypeError from unpacking ``None``).
    """
    row = conn.execute(
        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
    ).fetchone()
    if row is None:
        raise LookupError(f"note {id!r} not found")
    return Note(*row)
|
||||
|
||||
|
||||
# note id -> Tree for notes that have neither a reply target nor a renote target
roots = {}
# note id -> Tree for every note seen so far
trees = {}
|
||||
|
||||
|
||||
def tree_init(id: str, seek: bool = True) -> Tree:
    """Return the Tree node for note `id`, creating it and its ancestors.

    A new node is attached to the node of the note it replies to and/or
    renotes; those parents are created recursively first. A note with
    neither a reply target nor a renote target is recorded in `roots`.
    Every created node is recorded in `trees`.

    Args:
        id: id of the note to look up or create a node for.
        seek: accepted for compatibility; not consulted by this function.
    """
    if tree := trees.get(id):
        return tree

    tree = Tree(id, [], [])
    note = get_note(id)

    if note.reply_id:
        tree_init(note.reply_id).replies.append(tree)
    if note.renote_id:
        tree_init(note.renote_id, False).renotes.append(tree)
    if not (note.reply_id or note.renote_id):
        roots[id] = tree

    # Registered only after the parents exist, matching the creation order
    # root -> ... -> this note.
    trees[id] = tree
    return tree
|
||||
|
||||
|
||||
def make_widgets(msg, trees, roots):
    """Build the widget list for a progress bar labelled `msg`.

    Args:
        msg: text shown at the start of the bar.
        trees: if truthy, add a "trees" variable widget.
        roots: if truthy, add a "roots" variable widget.

    Returns:
        A list of strings and progressbar widgets, ending with an ETA.
    """
    widgets = [
        f"{msg} ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"),
        " ",
    ]
    if trees:
        widgets += [progressbar.Variable("trees"), " "]
    if roots:
        widgets += [progressbar.Variable("roots"), " "]
    widgets.append(progressbar.ETA())
    return widgets
|
||||
|
||||
|
||||
# Create a graph node (and all of its ancestors) for every fetched note,
# reporting the running node and root counts on the progress bar.
pb = progressbar.ProgressBar(
    0,
    len(note_ids),
    widgets=make_widgets("building trees", True, True),
)
for note_id in note_ids:
    tree_init(note_id)
    pb.increment(trees=len(trees), roots=len(roots))
pb.finish()
|
||||
|
||||
|
||||
def traverse(tree: Tree):
    """Walk down the reply tree until a note by the configured user is found.

    The first such note on each branch is handed to expand(); notes by other
    authors are skipped over and their replies are searched instead.
    """
    note = get_note(tree.id)
    if note.user_id == user_id:
        expand(tree)
    else:
        for child in tree.replies:
            traverse(child)
|
||||
|
||||
|
||||
def expand(tree: Tree):
    """Attach not-yet-known replies and renotes of `tree`, then recurse.

    Rows come from the database function ``note_replies``; ids already in
    `trees` are skipped. A row is attached as a reply and/or a renote
    depending on which of its targets equals this node's id, and registered
    in `trees`. Finally every reply of this node is expanded the same way.
    """
    for row in conn.execute(
        "select id from note_replies(%s, 1, 1000)", [tree.id]
    ).fetchall():
        child_id = row[0]
        if child_id in trees:
            continue

        note = get_note(child_id)
        is_reply = note.reply_id == tree.id
        is_renote = note.renote_id == tree.id
        if not (is_reply or is_renote):
            continue  # unrelated to this node: nothing to attach

        new = Tree(child_id, [], [])
        if is_reply:
            tree.replies.append(new)
        if is_renote:
            tree.renotes.append(new)
        trees[child_id] = new

    for child in tree.replies:
        expand(child)
|
||||
|
||||
|
||||
# Starting from every root, find the configured user's notes and pull in the
# replies/renotes underneath them; the bar shows the growing node count.
roots_len = len(roots)
pb = progressbar.ProgressBar(
    0, roots_len, widgets=make_widgets("expanding roots", True, False)
)

for root in roots.values():
    traverse(root)
    pb.increment(trees=len(trees))
pb.finish()
|
||||
|
||||
|
||||
# Write one line per node to graph.db, tab-separated:
#   <id> \t <reply ids, comma-joined> \t <renote ids, comma-joined> \t <flags>
# where flags is a comma-joined subset of "root" and "self".
with Path("graph.db").open("w") as f:
    pb = progressbar.ProgressBar(
        0, len(trees), widgets=make_widgets("saving graph", False, False)
    )
    for tree in trees.values():
        flags = []
        if tree.id in roots:
            flags.append("root")
        if get_note(tree.id).user_id == user_id:
            flags.append("self")

        columns = (
            tree.id,
            ",".join(reply.id for reply in tree.replies),
            ",".join(renote.id for renote in tree.renotes),
            ",".join(flags),
        )
        # One write per record, so a failure cannot leave a half-written line.
        f.write("\t".join(columns) + "\n")
        pb.increment()
    pb.finish()
|
84
2_filter.py
Normal file
84
2_filter.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Callable, List
|
||||
|
||||
import psycopg
|
||||
|
||||
from ty import FilterableNote, Visibility
|
||||
|
||||
try:
|
||||
import progressbar2 as progressbar
|
||||
except ImportError:
|
||||
import progressbar
|
||||
|
||||
|
||||
print("configuring")
# NOTE(review): config.py is executed as ordinary Python, so it must be a
# trusted, locally-authored file. Its top-level names end up in `config`.
config = {}
exec(Path("config.py").read_text(), config)
# config.py supplies a `connect` callable that opens the database connection
# and a `criteria` predicate that is called with each transformed root note.
conn: psycopg.Connection = config["connect"]()
criteria: Callable[[FilterableNote], bool] = config["criteria"]
|
||||
|
||||
# note id -> {"id", "replies", "quotes", "flags"} as read back from graph.db
intermediate = {}

print("parsing")
# Each line has four tab-separated columns; the last three are comma-joined
# lists, and an empty column means an empty list.
for line in Path("graph.db").read_text().splitlines():
    note_id, replies, quotes, flags = line.split("\t")
    intermediate[note_id] = {
        "id": note_id,
        "replies": replies.split(",") if replies else [],
        "quotes": quotes.split(",") if quotes else [],
        "flags": flags.split(",") if flags else [],
    }
|
||||
|
||||
|
||||
def transform(entry: dict) -> "FilterableNote | None":
    """Turn a parsed graph entry and its descendants into a FilterableNote.

    Args:
        entry: one value of `intermediate` (keys "id", "replies", "quotes",
            "flags").

    Returns:
        The note with its replies and quotes transformed recursively, or
        ``None`` if this note or any descendant is no longer in the database.
    """
    note = conn.execute(
        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
        [entry["id"]],
    ).fetchone()
    if note is None:
        return None  # part of thread disappeared during processing
    when, reactions, renotes, visibility = note

    children = {}
    for kind in ("replies", "quotes"):
        children[kind] = []
        for child_id in entry[kind]:
            child = transform(intermediate[child_id])
            if child is None:
                # bubble up, buttercup -- and stop here rather than
                # querying the remaining siblings for nothing
                return None
            children[kind].append(child)

    return FilterableNote(
        entry["id"],
        "self" in entry["flags"],
        children["replies"],
        children["quotes"],
        when.astimezone(),  # convert to the local timezone
        # presumably a mapping of reaction -> count; verify against the schema
        sum(reactions.values()),
        renotes,
        Visibility.from_db(visibility),
    )
|
||||
|
||||
|
||||
# Only entries flagged "root" are evaluated; count them up front so the
# progress bar has a maximum.
root_count = sum(1 for entry in intermediate.values() if "root" in entry["flags"])

pb = progressbar.ProgressBar(
    0,
    root_count,
    prefix="processing ",
)
targets = []
for entry in intermediate.values():
    if "root" not in entry["flags"]:
        continue
    transformed = transform(entry)
    # None means part of the thread vanished; we'll get to it next cycle.
    if transformed is not None and criteria(transformed):
        targets.append(entry["id"])
    # Count every handled root, including skipped ones, so the bar can
    # actually reach its maximum.
    pb.increment()
pb.finish()

# One selected root id per line.
Path("filtered.list").write_text("\n".join(targets))
|
0
requirements.txt
Normal file
0
requirements.txt
Normal file
61
ty.py
Normal file
61
ty.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import List, Callable
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
class Visibility(Enum):
|
||||
public = 1
|
||||
unlisted = 2
|
||||
followers = 3
|
||||
direct = 4
|
||||
|
||||
@classmethod
|
||||
def from_db(cls, raw: str) -> "Visibility":
|
||||
match raw:
|
||||
case "public": return cls.public
|
||||
case "home": return cls.unlisted
|
||||
case "followers": return cls.followers
|
||||
case "specified": return cls.direct
|
||||
case _: raise ValueError(f"unknown visibility `{raw}`")
|
||||
|
||||
|
||||
@dataclass
class FilterableNote:
    """A note together with its reply and quote subtrees."""

    id: str
    mine: bool  # whether the note is flagged as the configured user's own
    replies: List["FilterableNote"]
    quotes: List["FilterableNote"]
    when: datetime
    reactions: int
    renotes: int
    # Quoted like the other forward references, so the annotation is not
    # evaluated when the class is created.
    visibility: "Visibility"

    def thread(self) -> List["FilterableNote"]:
        """Return every note in this subtree, descendants first.

        Order: each reply's thread, then each quote's thread, then this note.
        """
        acc = []
        for reply in self.replies:
            acc += reply.thread()
        for quote in self.quotes:
            acc += quote.thread()
        acc.append(self)
        return acc

    def thread_self(self) -> List["FilterableNote"]:
        """Return the notes of thread() that are `mine`, in the same order."""
        return [note for note in self.thread() if note.mine]

    def to_dict(self):
        """Return a nested plain-dict form of this note and its subtrees.

        `when` is rendered with ``isoformat()``.
        NOTE(review): `visibility` is not part of the output -- confirm
        whether that is intentional.
        """
        return {
            "id": self.id,
            "mine": self.mine,
            "replies": [note.to_dict() for note in self.replies],
            "quotes": [note.to_dict() for note in self.quotes],
            "when": self.when.isoformat(),
            "reactions": self.reactions,
            "renotes": self.renotes,
        }
|
Loading…
Reference in a new issue