From 71dbf5979632161df7b98ed475bad4795c51fb28 Mon Sep 17 00:00:00 2001
From: io <gie9ohbeixah@paperboats.net>
Date: Fri, 11 Jun 2021 21:29:51 +0000
Subject: [PATCH] add ability to ignore CWs

---
 README.md    |  2 ++
 functions.py |  5 +++--
 main.py      | 43 ++++---------------------------------------
 3 files changed, 9 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index a4c4d86..5ac8ba6 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ This version makes quite a few changes from [the original](https://github.com/Je
 - Non-Markov stuff
 - Stores toots in a sqlite database rather than a text file
   - Doesn't unnecessarily redownload all toots every time
+- Ability to ignore specific CWs
 
 ## FediBooks
 Before you use mstdn-ebooks to create your own ebooks bot, I recommend checking out [FediBooks](https://fedibooks.com). Compared to mstdn-ebooks, FediBooks offers a few advantages:
@@ -54,6 +55,7 @@ Configuring mstdn-ebooks is accomplished by editing `config.json`. If you want t
 | cw                       | null                                    | The content warning (aka subject) mstdn-ebooks will apply to non-error posts.                                                                                                                                                                                                           |
 | instance_blacklist       | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. |
 | learn_from_cw            | false                                   | If true, mstdn-ebooks will learn from CW'd posts.                                                                                                                                                                                                                                       |
+| ignored_cws              | []                                      | If `learn_from_cw` is true, do not learn from posts with these CWs.
 | mention_handling         | 1                                       | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour).                                                                                                                                            |
 | max_thread_length        | 15                                      | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times.                                                                                                      |
 | strip_paired_punctuation | false                                   | If true, mstdn-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it.                                                                                                          |
diff --git a/functions.py b/functions.py
index d9e38cb..36cbfdb 100755
--- a/functions.py
+++ b/functions.py
@@ -19,9 +19,10 @@ def make_sentence(output, cfg):
 	db.text_factory = str
 	c = db.cursor()
 	if cfg['learn_from_cw']:
-		toots = c.execute("SELECT content FROM `toots` ORDER BY RANDOM() LIMIT 10000").fetchall()
+		ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"  # one "?" placeholder per ignored CW
+		toots = c.execute(f"SELECT content FROM `toots` WHERE cw IS NULL OR cw NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000", cfg["ignored_cws"]).fetchall()  # "cw IS NULL OR" keeps posts without a CW: in SQL, NULL NOT IN (non-empty list) is NULL, so those rows would otherwise be dropped
 	else:
-		toots = c.execute("SELECT content FROM `toots` WHERE cw = 0 ORDER BY RANDOM() LIMIT 10000").fetchall()
+		toots = c.execute("SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000").fetchall()
 
 	if len(toots) == 0:
 		output.send("Database is empty! Try running main.py.")
diff --git a/main.py b/main.py
index d67d377..eb2fb58 100755
--- a/main.py
+++ b/main.py
@@ -31,7 +31,8 @@ cfg = {
 	"length_lower_limit": 5,
 	"length_upper_limit": 50,
 	"overlap_ratio_enabled": False,
-	"overlap_ratio": 0.7
+	"overlap_ratio": 0.7,
+	"ignored_cws": [],
 }
 
 try:
@@ -94,46 +95,10 @@ following = client.account_following(me.id)
 db = sqlite3.connect("toots.db")
 db.text_factory = str
 c = db.cursor()
-c.execute("CREATE TABLE IF NOT EXISTS `toots` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw INT NOT NULL DEFAULT 0, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
+c.execute("CREATE TABLE IF NOT EXISTS `toots` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw VARCHAR, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
 c.execute("CREATE TRIGGER IF NOT EXISTS `dedup` AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ")
 db.commit()
 
-tableinfo = c.execute("PRAGMA table_info(`toots`)").fetchall()
-found = False
-columns = []
-for entry in tableinfo:
-	if entry[1] == "sortid":
-		found = True
-		break
-	columns.append(entry[1])
-
-if not found:
-	print("Migrating to new database format. Please wait...")
-	print("WARNING: If any of the accounts your bot is following are Pleroma users, please delete toots.db and run main.py again to create it anew.")
-	try:
-		c.execute("DROP TABLE `toots_temp`")
-	except:
-		pass
-
-	c.execute("CREATE TABLE `toots_temp` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw INT NOT NULL DEFAULT 0, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
-	for f in following:
-		user_toots = c.execute("SELECT * FROM `toots` WHERE userid LIKE ? ORDER BY id", (f.id,)).fetchall()
-		if user_toots is None:
-			continue
-
-		if columns[-1] == "cw":
-			for toot in user_toots:
-				c.execute("INSERT INTO `toots_temp` (id, userid, uri, content, cw) VALUES (?, ?, ?, ?, ?)", toot)
-		else:
-			for toot in user_toots:
-				c.execute("INSERT INTO `toots_temp` (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", toot)
-
-	c.execute("DROP TABLE `toots`")
-	c.execute("ALTER TABLE `toots_temp` RENAME TO `toots`")
-	c.execute("CREATE TRIGGER IF NOT EXISTS `dedup` AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ")
-
-db.commit()
-
 
 def handleCtrlC(signal, frame):
 	print("\nPREMATURE EVACUATION - Saving chunks")
@@ -155,7 +120,7 @@ def insert_toot(oii, acc, post, cursor):  # extracted to prevent duplication
 	pid = patterns["pid"].search(oii['object']['id']).group(0)
 	cursor.execute("REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", (
 		pid,
-		1 if (oii['object']['summary'] is not None and oii['object']['summary'] != "") else 0,
+		oii['object']['summary'] or None,
 		acc.id,
 		oii['object']['id'],
 		post