Edit filter configuration

Differences between versions

Item | Version from 01:03, 13 April 2024 by Suffusion of Yellow | Version from 21:10, 13 April 2024 by Suffusion of Yellow
Basic information
Notes:
Same word list as 1296. Words with too many legitimate uses to disallow outright. Instead, there must be some other clues of vandalism.
Same word list as 1296. Words with too many legitimate uses to disallow outright. Instead, there must be some other clues of vandalism.


Of course, this will take a huge amount of refinement. Therefore ALWAYS keep the word list in sync with 1296; otherwise it will be impossible to check if any change will cause FPs! --Suffusion of Yellow 23:16 4 Apr 2024
Of course, this will take a huge amount of refinement. Therefore ALWAYS keep the word list in sync with 1296; otherwise it will be impossible to check if any change will cause FPs! --Suffusion of Yellow 23:16 4 Apr 2024


No harm in tagging for now; already down a few percent false positives. --Suffusion of Yellow 19:51 9 Apr 2024
No harm in tagging for now; already down a few percent false positives. --Suffusion of Yellow 19:51 9 Apr 2024


Use length of summary, instead of all-or-nothing. Also catch text added to beginning of single line. --Suffusion of Yellow 21:14 9 Apr 2024
Use length of summary, instead of all-or-nothing. Also catch text added to beginning of single line. --Suffusion of Yellow 21:14 9 Apr 2024


Check for a few common bad words from other filters in sweariness check, also check for "series" and "studios". Tweak multiple-words check to exclude multiple uses of the first match, unless back-to-back. --Suffusion of Yellow 23:27 10 Apr 2024
Check for a few common bad words from other filters in sweariness check, also check for "series" and "studios". Tweak multiple-words check to exclude multiple uses of the first match, unless back-to-back. --Suffusion of Yellow 23:27 10 Apr 2024


Tweak summary check. --Suffusion of Yellow 00:09 11 Apr 2024
Tweak summary check. --Suffusion of Yellow 00:09 11 Apr 2024


Check for curly quotes. --Suffusion of Yellow 21:02 11 Apr 2024
Check for curly quotes. --Suffusion of Yellow 21:02 11 Apr 2024


Check for unbalanced markup. --Suffusion of Yellow 19:46 12 Apr 2024
Check for unbalanced markup. --Suffusion of Yellow 19:46 12 Apr 2024


Check for lines not ending in punctuation (might need tweaking later). --Suffusion of Yellow 20:32 12 Apr 2024
Check for lines not ending in punctuation (might need tweaking later). --Suffusion of Yellow 20:32 12 Apr 2024


Just exclude any edits adding references. Also exclude any edits adding links containing the matched word. --Suffusion of Yellow 21:33 12 Apr 2024
Just exclude any edits adding references. Also exclude any edits adding links containing the matched word. --Suffusion of Yellow 21:33 12 Apr 2024
Tweak single-added word check to only check the match; tweak start-or-end-of-page check to account for deleting content. --Suffusion of Yellow 21:09 13 Apr 2024
Filter conditions
Conditions:
(documentation)
sus := "(?x)\b(?:
sus := "(?x)\b(?:
     #Common words
     #Common words
     a[ ]fraud
     a[ ]fraud
     |adolf[ ]hitler
     |adolf[ ]hitler
     |amazing
     |amazing
     |anal
     |anal
     |ass+
     |ass+
     |ahh+
     |ahh+
     |bald
     |bald
     |balls
     |balls
     |big[ ]black
     |big[ ]black
     |boobs
     |boobs
     |booty
     |booty
     |bum
     |bum
     |butt
     |butt
     |caca
     |caca
     |cheeks
     |cheeks
     |ching[ ]chong
     |ching[ ]chong
     |cool(?:est)?
     |cool(?:est)?
     |creeps?
     |creeps?
     |cum
     |cum
     |daddy
     |daddy
     |dumb?
     |dumb?
     |fart(?:ed|ing|s)?
     |fart(?:ed|ing|s)?
     |fat
     |fat
     |gay(?:est|s)?
     |gay(?:est|s)?
     |haha
     |haha
     |hehe
     |hehe
     |hello
     |hello
     |hes
     |hes
     |hola
     |hola
     |(?<!\S)hi(?!\S)
     |(?<!\S)hi(?!\S)
     |i[ ](?:eat|like|love|hate)
     |i[ ](?:eat|like|love|hate)
     |idk
     |idk
     |is[ ](?:bad|fake)
     |is[ ](?:bad|fake)
     |is[ ]the[ ](?:best|worst)
     |is[ ]the[ ](?:best|worst)
     |m[ou]m(?:my)?
     |m[ou]m(?:my)?
     |morons?
     |morons?
     |nonces?
     |nonces?
     |nuh
     |nuh
     |oh[ ]no
     |oh[ ]no
     |omg
     |omg
     |pedos?
     |pedos?
     |pee+
     |pee+
     |poo+
     |poo+
     |porn(hub|o)?
     |porn(hub|o)?
     |puta
     |puta
     |racists?
     |racists?
     |retards?
     |retards?
     |sexy
     |sexy
     |scammers?
     |scammers?
     |scumbags?
     |scumbags?
     |smell[ys]  
     |smell[ys]  
     |stink[ys]
     |stink[ys]
     |stupid
     |stupid
     |subscribe[ ]to
     |subscribe[ ]to
     |suck(?:ed|ing|s)?
     |suck(?:ed|ing|s)?
     |tits
     |tits
     |toes
     |toes
     |turd
     |turd
     |twat
     |twat
     |(?<!\S)ur(?!\S)
     |(?<!\S)ur(?!\S)
     |vaginas?
     |vaginas?
     |yall
     |yall
     |yummy
     |yummy


     #Memes
     #Memes
     |among[ ]us
     |among[ ]us
     |caseoh
     |caseoh
     |fortnite
     |fortnite
     |roblox
     |roblox
     |ronaldo
     |ronaldo
     |sigmas?
     |sigmas?
     |the[ ]goat
     |the[ ]goat
     |womp
     |womp
)\b";
)\b";


page_namespace == 0 &
page_namespace == 0 &
!("confirmed" in user_groups) &
!("confirmed" in user_groups) &
edit_delta < 1000 &
edit_delta < 1000 &
(
(
     match := get_matches("(?i)" + sus, added_lines)[0];
     match := get_matches("(?i)" + sus, added_lines)[0];
     match & (
     match & (
         escaped_match := "(?:\b" + rescape(match) + "\b)";
         escaped_match := "(?:\b" + rescape(match) + "\b)";
        
        
         !(removed_lines irlike sus) &
         !(removed_lines irlike sus) &
         !((old_wikitext + added_links) irlike escaped_match) & (
         !((old_wikitext + added_links) irlike escaped_match) & (
             ref_cnt := count("<ref", added_lines) - count("<ref", removed_lines);
             ref_cnt := count("<ref", added_lines) - count("<ref", removed_lines);
             ref_cnt <= 0
             ref_cnt <= 0
         ) &
         ) &
         !(added_links irlike escaped_match)
         !(added_links irlike escaped_match)
     )
     )
) & (
) & (
     /* Baseline AGF */
     /* Baseline AGF */
     score := 0.5;
     score := 0.5;


     /* More AGF on "sweary" pages (no need to include all of 384 here; just what's common in existing pages) */
     /* More AGF on "sweary" pages (no need to include all of 384 here; just what's common in existing pages) */
     score := score + 0.25 * rcount("(?i)fuck|\bshit|bitch|" + sus, old_wikitext);
     score := score + 0.25 * rcount("(?i)fuck|\bshit|bitch|" + sus, old_wikitext);


     /* More AGF on fiction or music related pages */
     /* More AGF on fiction or music related pages */
     score := score + 2.0 * (new_wikitext irlike "(?x)
     score := score + 2.0 * (new_wikitext irlike "(?x)
         category:.*(?:films|shows|books|series|studios|episodes|bands|musical[ ]groups|albums|songs)
         category:.*(?:films|shows|books|series|studios|episodes|bands|musical[ ]groups|albums|songs)
         |discography
         |discography
         |filmography
         |filmography
     ");  
     ");  
      
      
     /* Added markup */
     /* Added markup */
     markup_cnt := (rcount("[[\]{}|*#=]", added_lines)) - (rcount("[[\]{}|*#=]", removed_lines));
     markup_cnt := (rcount("[[\]{}|*#=]", added_lines)) - (rcount("[[\]{}|*#=]", removed_lines));
     clamped_markup_cnt := markup_cnt < -10 ? -10 : markup_cnt;
     clamped_markup_cnt := markup_cnt < -10 ? -10 : markup_cnt;
     score := score + 0.1 * clamped_markup_cnt;
     score := score + 0.1 * clamped_markup_cnt;


     /* Added quotes or italics */
     /* Added quotes or italics */
     quote := "(?<!')''(?!')|[\"\x{201C}\x{201D}]";
     quote := "(?<!')''(?!')|[\"\x{201C}\x{201D}]";
     quote_cnt := (rcount(quote, added_lines)) - rcount(quote, removed_lines);
     quote_cnt := (rcount(quote, added_lines)) - rcount(quote, removed_lines);
     clamped_quote_cnt := quote_cnt < 0 ? 0 : quote_cnt;
     clamped_quote_cnt := quote_cnt < 0 ? 0 : quote_cnt;
     score := score + 0.5 * clamped_quote_cnt;
     score := score + 0.5 * clamped_quote_cnt;


     /* Removed references */
     /* Removed references */
     score := score - 2.0 * (ref_cnt < 0);
     score := score - 2.0 * (ref_cnt < 0);


     /* More points for a long summary, unless it's one of the mobile suggestions */
     /* More points for a long summary, unless it's one of the mobile suggestions */
     sum := get_matches("^(?:/\*.*?\*/)?\s*(.*)\s*$", summary)[1];
     sum := get_matches("^(?:/\*.*?\*/)?\s*(.*)\s*$", summary)[1];
     if !(sum irlike "added content|fixed typo") then (
     if !(sum irlike "added content|fixed typo") then (
         score := score + 0.02 * length(sum);
         score := score + 0.02 * length(sum);
     ) end;
     ) end;


     /* Unencyclopedic language */
     /* Unencyclopedic language */
     bonus_words := "\b(?:i|me|my|your?)\b";
     bonus_words := "\b(?:i|me|my|your?)\b";
     score := score - 0.5 * (
     score := score - 0.5 * (
         added_lines irlike bonus_words &  
         added_lines irlike bonus_words &  
         !(removed_lines irlike bonus_words) &
         !(removed_lines irlike bonus_words) &
         !(match irlike bonus_words) /* Avoid double-counting "i like", etc. */
         !(match irlike bonus_words) /* Avoid double-counting "i like", etc. */
     );
     );


     /* Did they add these words and do nothing else except adjust whitespace and punctuation? */
     /* Did they add the matched word and do nothing else except adjust whitespace and punctuation? */
     score := score - 2.0 * (norm(str_replace_regexp(added_lines, sus, "")) == norm(removed_lines));
     score := score - 2.0 * (norm(str_replace_regexp(added_lines, match, "")) == norm(removed_lines));


     /* Multiple bad words (different from the first match; if that has one legit. use, it has many) */
     /* Multiple bad words (different from the first match; if that has one legit. use, it has many) */
     extra_cnt := rcount("(?i)(?!" + escaped_match + ")" + sus, added_lines);
     extra_cnt := rcount("(?i)(?!" + escaped_match + ")" + sus, added_lines);
     clamped_extra_cnt := extra_cnt > 4 ? 4 : extra_cnt;
     clamped_extra_cnt := extra_cnt > 4 ? 4 : extra_cnt;
     score := score - 0.5 * clamped_extra_cnt;
     score := score - 0.5 * clamped_extra_cnt;
      
      
     /* Back-to-back bad words */
     /* Back-to-back bad words */
     score := score - 2.5 * (added_lines irlike ("(?:(?:" + sus + ")\W*){2}"));
     score := score - 2.5 * (added_lines irlike ("(?:(?:" + sus + ")\W*){2}"));


     /* If the word count is exactly the same, that probably means they just swapped out one word, or did a search-and-replace */
     /* If the word count is exactly the same, that probably means they just swapped out one word, or did a search-and-replace */
     score := score - 1.0 * (rcount("\w+", added_lines) == rcount("\w+", removed_lines));
     score := score - 1.0 * (rcount("\w+", added_lines) == rcount("\w+", removed_lines));


     /* Anything in all caps, not necessarily these words */
     /* Anything in all caps, not necessarily these words */
     score := score - 1.0 * (rcount("[A-Z]{4,}", added_lines) > rcount("[A-Z]{4,}", removed_lines));
     score := score - 1.0 * (rcount("[A-Z]{4,}", added_lines) > rcount("[A-Z]{4,}", removed_lines));


     if (!quote_cnt & !markup_cnt) then (
     if (!quote_cnt & !markup_cnt) then (
         /* Added plain text to the end of a single line */
         /* Added plain text to the end of a single line */
         score := score - 1.0 * (
         score := score - 1.0 * (
                 strpos(rmwhitespace(added_lines), rmwhitespace(removed_lines)) != -1 &
                 strpos(rmwhitespace(added_lines), rmwhitespace(removed_lines)) != -1 &
                 length(added_lines) == 1 &
                 length(added_lines) == 1 &
                 length(removed_lines) == 1
                 length(removed_lines) == 1
         );
         );


        /* Added plain text to the start or end of the page */
        score := score - 2.0 * (strpos(rmwhitespace(new_wikitext), rmwhitespace(old_wikitext)) != -1);
       
         /* No changes at all to punctuation or markup */
         /* No changes at all to punctuation or markup */
         score := score - 2.0 * (str_replace_regexp(added_lines, "[\w\s]", "") == str_replace_regexp(removed_lines, "[\w\s]", ""));
         score := score - 2.0 * (str_replace_regexp(added_lines, "[\w\s]", "") == str_replace_regexp(removed_lines, "[\w\s]", ""));
     ) end;
     ) end;
      
      
     /* Not much use of the shift key */
     /* Not much use of the shift key */
     score := score - 1.0 * (rcount('[A-Z]', added_lines) == rcount('[A-Z]', removed_lines));
     score := score - 1.0 * (rcount('[A-Z]', added_lines) == rcount('[A-Z]', removed_lines));
      
      
     /* Red link containing the specific word they added */
     /* Red link containing the specific word they added */
     score := score - 2.0 * (new_html irlike ('class="new" title="[^"]*' + escaped_match));
     score := score - 2.0 * (new_html irlike ('class="new" title="[^"]*' + escaped_match));


     /* Unbalanced markup */
     /* Unbalanced markup */
     score := score - 2.0 * (
     score := score - 2.0 * (
         rcount("[{\[]", added_lines) != rcount("[}\]]", added_lines) &
         rcount("[{\[]", added_lines) != rcount("[}\]]", added_lines) &
         rcount("[{\[]", old_wikitext) == rcount("[}\]]", old_wikitext)
         rcount("[{\[]", old_wikitext) == rcount("[}\]]", old_wikitext)
     );
     );


     /* No punctuation or numbers between matched word and end of the line */
     /* Nothing but letters and spaces between matched word and end of a line */
     score := score - 2.0 * (
     score := score - 2.0 * (
         added_lines irlike ("(?m)" + escaped_match + "[a-z ]*$")
         added_lines irlike ("(?m)" + escaped_match + "[a-z ]*$")
    );
    /* No template or link markup between matched word and start or end of the page */
    score := score - 2.0 * (
        new_wikitext irlike (escaped_match + "[^}\]]*$|^[^{\[]*" + escaped_match)
     );
     );


     score < 0
     score < 0
)
)