Edit filter configuration

Differences between versions

-Item
+Version from 18:46, 18 April 2024 by Suffusion of Yellow
 Same word list as 1296. Words with too many legitimate uses to disallow outright. Instead, there must be some other clues of vandalism.
 Of course, this will take a huge amount of refinement. Therefore ALWAYS keep the word list in sync with 1296; otherwise it will be impossible to check if any change will cause FPs! --Suffusion of Yellow 23:16 4 Apr 2024
 No harm in tagging for now; already down a few percent false positives. --Suffusion of Yellow 19:51 9 Apr 2024
 Use length of summary, instead of all-or-nothing. Also catch text added to beginning of single line. --Suffusion of Yellow 21:14 9 Apr 2024
 Check for a few common bad words from other filters in sweariness check, also check for "series" and "studios". Tweak the multiple-words check to exclude multiple uses of the first match, unless back-to-back. --Suffusion of Yellow 23:27 10 Apr 2024
 Tweak summary check. --Suffusion of Yellow 00:09 11 Apr 2024
 Check for curly quotes. --Suffusion of Yellow 21:02 11 Apr 2024
 Check for unbalanced markup. --Suffusion of Yellow 19:46 12 Apr 2024
 Check for lines not ending in punctuation (might need tweaking later). --Suffusion of Yellow 20:32 12 Apr 2024
 Just exclude any edits adding references. Also exclude any edits adding links containing the matched word. --Suffusion of Yellow 21:33 12 Apr 2024
 Tweak single-added word check to only check the match; tweak start-or-end-of-page check to account for deleting content. --Suffusion of Yellow 21:09 13 Apr 2024
 Require all-caps string to be part of the match, otherwise matches unrelated acronyms. --Suffusion of Yellow 02:49 16 Apr 2024
 Temporarily set to warn; I want to see what fraction of people click past the warning. 100 out of the last 100 hits (going back about a day) have been reverted. --Suffusion of Yellow 18:29 17 Apr 2024
+Tweak a few checks to be more forgiving, also a crude check for "complex" diffs since disallowing those can be more BITEy. --Suffusion of Yellow 18:46 18 Apr 2024
 sus := "(?x)\b(?:
      #Common words
       a[ ]fraud
      |adolf[ ]hitler
      |amazing
      |anal
      |ass+
      |ahh+
      |bald
      |balls
      |big[ ]black
      |boobs
      |booty
      |bum
      |butt
      |caca
      |cheeks
      |ching[ ]chong
      |cool(?:est)?
      |creeps?
      |cum
      |daddy
      |dumb?
      |fart(?:ed|ing|s)?
      |fat
      |gay(?:est|s)?
      |haha
      |hehe
      |hello
      |hes
      |hola
      |(?<!\S)hi(?!\S)
      |i[ ](?:eat|like|love|hate)
      |idk
      |is[ ](?:bad|fake)
      |is[ ]the[ ](?:best|worst)
      |m[ou]m(?:my)?
      |morons?
      |nonces?
      |nuh
      |oh[ ]no
      |omg
      |pedos?
      |pee+
      |poo+
      |porn(hub|o)?
      |puta
      |racists?
      |retards?
      |sexy
      |scammers?
      |scumbags?
      |smell[ys]
      |stink[ys]
      |stupid
      |subscribe[ ]to
      |suck(?:ed|ing|s)?
      |tits
      |toes
      |turd
      |twat
      |(?<!\S)ur(?!\S)
      |vaginas?
      |yall
      |yummy
      #Memes
      |among[ ]us
      |caseoh
      |fortnite
      |roblox
      |ronaldo
      |sigmas?
      |the[ ]goat
      |womp
 )\b";
 page_namespace == 0 &
 !("confirmed" in user_groups) &
-edit_delta < 1000 &
+edit_delta < 500 &
 (
      match := get_matches("(?i)" + sus, added_lines)[0];
      match & (
          escaped_match := "(?:\b" + rescape(match) + "\b)";
-         !(removed_lines irlike sus) &
+         !((removed_lines + "\n" + page_title) irlike sus) &
-         !((old_wikitext + added_links) irlike escaped_match) & (
+         !((old_wikitext + "\n" + added_links) irlike escaped_match) & (
              ref_cnt := count("<ref", added_lines) - count("<ref", removed_lines);
              ref_cnt <= 0
-         ) &
+         )
-        !(added_links irlike escaped_match)
      )
 ) & (
      /* Baseline AGF */
      score := 0.5;
      /* More AGF on "sweary" pages (no need to include all of 384 here; just what's common in existing pages) */
      score := score + 0.25 * rcount("(?i)fuck|\bshit|bitch|" + sus, old_wikitext);
      /* More AGF on fiction or music related pages */
      score := score + 2.0 * (new_wikitext irlike "(?x)
           category:.*(?:films|shows|books|series|studios|episodes|bands|musical[ ]groups|albums|songs)
          |discography
          |filmography
      ");
-     /* Added markup */
+     /* More points for "complex" diffs */
-     markup_cnt := (rcount("[[\]{}|*#=]", added_lines)) - (rcount("[[\]{}|*#=]", removed_lines));
+     score := score + 0.1 * rcount("(?m)^@@", edit_diff);
-     clamped_markup_cnt := markup_cnt < -10 ? -10 : markup_cnt;
-    score := score + 0.1 * clamped_markup_cnt;
+    /* More points for a long summary, unless it's one of the mobile suggestions */
+    sum := get_matches("^(?:/\*.*?\*/)?\s*(.*)\s*$", summary)[1];
+     if !(sum irlike "added content|fixed typo") then (
+        score := score + 0.02 * length(sum);
+    ) end;
      /* Added quotes or italics */
      quote := "(?<!')''(?!')|[\"\x{201C}\x{201D}]";
      quote_cnt := (rcount(quote, added_lines)) - rcount(quote, removed_lines);
      clamped_quote_cnt := quote_cnt < 0 ? 0 : quote_cnt;
      score := score + 0.5 * clamped_quote_cnt;
+    /* Added or removed markup */
+    markup_cnt := (rcount("[[\]{}|*#=]", added_lines)) - (rcount("[[\]{}|*#=]", removed_lines));
+    clamped_markup_cnt := markup_cnt < -10 ? -10 : markup_cnt;
+    score := score + 0.1 * clamped_markup_cnt;
      /* Removed references */
      score := score - 2.0 * (ref_cnt < 0);
-    /* More points for a long summary, unless it's one of the mobile suggestions */
-    sum := get_matches("^(?:/\*.*?\*/)?\s*(.*)\s*$", summary)[1];
-    if !(sum irlike "added content|fixed typo") then (
-        score := score + 0.02 * length(sum);
-    ) end;
      /* Unencyclopedic language */
      bonus_words := "\b(?:i|me|my|your?)\b";
      score := score - 0.5 * (
          added_lines irlike bonus_words &
          !(removed_lines irlike bonus_words) &
          !(match irlike bonus_words) /* Avoid double-counting "i like", etc. */
      );
      /* Did they add the matched word and do nothing else except adjust whitespace and punctuation? */
-     score := score - 2.0 * (norm(str_replace_regexp(added_lines, match, "")) == norm(removed_lines));
+     score := score - 2.0 * (rmwhitespace(rmspecials(str_replace_regexp(added_lines, match, ""))) == rmwhitespace(rmspecials(removed_lines)));
      /* Multiple bad words (different from the first match; if that has one legit. use, it has many) */
      extra_cnt := rcount("(?i)(?!" + escaped_match + ")" + sus, added_lines);
      clamped_extra_cnt := extra_cnt > 4 ? 4 : extra_cnt;
      score := score - 0.5 * clamped_extra_cnt;
      /* Back-to-back bad words */
      score := score - 2.5 * (added_lines irlike ("(?:(?:" + sus + ")\W*){2}"));
      /* If the word count is exactly the same, that probably means they just swapped out one word, or did a search-and-replace */
      score := score - 1.0 * (rcount("\w+", added_lines) == rcount("\w+", removed_lines));
      /* Match contains at least one all-caps word */
      score := score - 1.0 * (match rlike '\b[A-Z]{2,}\b');
      if (!quote_cnt & !markup_cnt) then (
          /* Added plain text to the start and/or end of a single line */
          score := score - 1.0 * (
                  strpos(rmwhitespace(added_lines), rmwhitespace(removed_lines)) != -1 &
                  length(added_lines) == 1 &
                  length(removed_lines) == 1
          );
          /* No changes at all to punctuation or markup */
          score := score - 2.0 * (str_replace_regexp(added_lines, "[\w\s]", "") == str_replace_regexp(removed_lines, "[\w\s]", ""));
      ) end;
      /* Not much use of the shift key */
      score := score - 1.0 * (rcount('[A-Z]', added_lines) == rcount('[A-Z]', removed_lines));
      /* Red link containing the specific word they added */
      score := score - 2.0 * (new_html irlike ('class="new" title="[^"]*' + escaped_match));
      /* Unbalanced markup */
      score := score - 2.0 * (
          rcount("[{\[]", added_lines) != rcount("[}\]]", added_lines) &
          rcount("[{\[]", old_wikitext) == rcount("[}\]]", old_wikitext)
      );
      /* Nothing but letters and spaces between matched word and end of a line */
      score := score - 2.0 * (
          added_lines irlike ("(?m)" + escaped_match + "[a-z ]*$")
      );
      /* No template or link markup between matched word and start or end of the page */
      score := score - 2.0 * (
          new_wikitext irlike (escaped_match + "[^}\]]*$|^[^{\[]*" + escaped_match)
      );
      score < 0
 )

Older change | Newer change