script-spambot-cleanup 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  #!/usr/bin/perl

  # Remove greeting spambots from the mozilla network database.
  #
  # A "spambot" here is a nick whose only messages inside the scan window are
  # one of a fixed set of greetings.  The script:
  #
  #   1. collects every nick that said one of the greetings inside the window,
  #   2. drops every nick that also said anything else inside the window,
  #   3. prints and deletes the greeting rows of the nicks that remain,
  #   4. records the end of the window in a marker file so the next run
  #      resumes from there.
  #
  # The window runs from the time stored in the marker file (or a built-in
  # default when the file does not exist) up to one hour before "now".

  use local::lib;
  use strict;
  use v5.10;
  use warnings;

  use FindBin qw( $RealBin );
  use lib "$RealBin/lib";

  BEGIN {
      $ENV{TZ} = 'UTC';
  }

  # unbuffered STDOUT so progress shows up as it happens
  $| = 1;

  use LogBot::Config qw( find_config load_config );
  use LogBot::Database qw( dbh execute_with_retry );
  use LogBot::Util qw( event_to_string path_for plural touch );
  use Mojo::File qw( path );
  use Mojo::Util qw( trim );

  # start of the window when no marker file exists yet
  my $start_time = 1_534_710_129;

  # the most recent hour is never examined
  my $end_time = time() - 60 * 60;

  # messages that, on their own, mark a nick as a suspect
  my @greetings = (
      'eh!',
      'hello',
      'hello!',
      'hey',
      'hey!',
      'hi',
      'hi!',
      'hola',
      'hola!',
      'holaa',
      'holaaaaaaa',
      'oye!',
      'tiens',
      'hé!',
      "i'm here",
  );

  my $config = load_config(find_config('mozilla'));
  my $dbh = dbh($config, read_write => 1);

  # quoted, comma separated list ready to drop into an SQL IN (...) clause
  my $greetings = join(',', map { $dbh->quote($_) } @greetings);

  # resume from where the previous run stopped, if it left a marker behind
  my $start_file = path_for($config, 'store') . '/spambot-cleanup';
  if (-e $start_file) {
      $start_time = trim(path($start_file)->slurp);

      # The value is interpolated into the SQL below, so refuse anything that
      # is not a plain epoch: an empty or damaged marker file would otherwise
      # produce malformed (or attacker-controlled) SQL.  It is validated and
      # interpolated rather than bound because a bound value may be sent to
      # the database as text -- TODO confirm how logs.time is declared before
      # switching to placeholders.
      die "invalid start time '$start_time' in $start_file\n"
          unless $start_time =~ /\A\d+\z/;
  }

  # step 1: every nick that said a greeting inside the window.  The result
  # only feeds a hash, so no ordering is requested.
  my $suspects = $dbh->selectcol_arrayref(
      "SELECT DISTINCT LOWER(nick)
       FROM logs
       WHERE (time BETWEEN $start_time AND $end_time)
       AND (text COLLATE NOCASE IN ($greetings))"
  );
  my %nicks = map { $_ => 1 } @{$suspects};
  say 'found ', plural(scalar(keys %nicks), 'suspect');

  # nothing to do; the marker file is deliberately left untouched
  exit unless scalar(keys %nicks);

  # step 2: a suspect that said anything other than a greeting inside the
  # window is treated as a real user.  The statement is the same for every
  # nick, so it is prepared once outside the loop.
  my $other_text_sth = $dbh->prepare(
      "SELECT COUNT(*)
       FROM logs
       WHERE (time BETWEEN $start_time AND $end_time)
       AND (nick = ? COLLATE NOCASE)
       AND (text COLLATE NOCASE NOT IN ($greetings))"
  );
  my $excluded = 0;
  foreach my $nick (sort keys %nicks) {
      my ($count) = $dbh->selectrow_array($other_text_sth, undef, $nick);
      next unless $count;
      delete $nicks{$nick};
      $excluded++;
  }
  say 'excluded ', plural($excluded, 'account'), ', leaving ', plural(scalar(keys %nicks), 'spammer');

  # step 3: print and delete the greeting rows of the remaining nicks.
  # Skipped entirely when every suspect was excluded.
  my $dirty = 0;
  if (%nicks) {
      my $spammers = join(',', map { $dbh->quote($_) } sort keys %nicks);

      # Fetch all matching rows up front so the SELECT has finished before the
      # first DELETE is issued.  NOTE(review): execute_with_retry hands the
      # callback its own handle; whether that is a separate connection (and
      # so whether an open read cursor could block it) is not visible here.
      my $events = $dbh->selectall_arrayref(
          "SELECT *
           FROM logs
           WHERE (time BETWEEN $start_time and $end_time)
           AND (nick COLLATE NOCASE IN ($spammers))
           AND (text COLLATE NOCASE IN ($greetings))
           ORDER BY time",
          { Slice => {} }
      );

      foreach my $event (@{$events}) {
          say event_to_string($event);
          $dirty = 1;
          execute_with_retry(
              $config,
              sub {
                  my ($_dbh) = @_;
                  $_dbh->do('DELETE FROM logs WHERE id = ?', undef, $event->{id});
                  return 1;
              }
          ) // die "failed to delete log entry $event->{id}\n";
      }
  }

  # presumably tells other LogBot components that the database changed --
  # TODO confirm what watches this file
  if ($dirty) {
      touch($config->{_derived}->{file});
  }

  # step 4: the next run starts where this one stopped.
  # NOTE(review): recent Mojolicious releases deprecate spurt in favour of
  # spew -- confirm the installed version before changing it.
  path($start_file)->spurt("$end_time\n");