#!/usr/bin/perl # update channel metadata, etc use local::lib; use v5.10; use strict; use warnings; use FindBin qw( $RealBin ); use lib "$RealBin/lib"; BEGIN { $ENV{TZ} = 'UTC'; $ENV{CRON} = !$ENV{DEBUG}; } use Cpanel::JSON::XS qw( encode_json ); use DateTime (); use List::Util qw( any ); use LogBot::Config qw( find_config load_all_configs load_config reload_config save_config ); use LogBot::Database qw( dbh replace_sql_placeholders ); use LogBot::Util qw( file_for logbot_init nick_is_bot round spurt ); use LogBot::Web::Colour qw( nick_hash ); use Mojo::Log (); use Readonly; # channels with activity are archived after 6 months. Readonly::Scalar my $ARCHIVE_TIME => 60 * 60 * 24 * (365 / 2); # newly logged channels have 5 days for activity before they are deleted. # a channel is considered empty if there's fewer than EMPTY_THRESHOLD messages. Readonly::Scalar my $EMPTY_THRESHOLD => 5; Readonly::Scalar my $DELETE_TIME => 60 * 60 * 24 * 5; my @configs; if (@ARGV && $ARGV[0] eq '--all') { shift; @configs = values %{ load_all_configs() }; } else { push @configs, load_config(find_config(shift)); } @configs || die "syntax: syntax: logbot-nightly \n"; my $log = Mojo::Log->new(path => "$RealBin/log/nightly.log"); foreach my $config (sort { $a->{name} cmp $b->{name} } @configs) { logbot_init($config, name => 'logbot-nightly', quiet => 1); # generate meta data generate_meta($config) unless $ENV{NO_META}; # stop logging and leave stale channels say $config->{name}, ': archiving stale channels' if $ENV{DEBUG}; archive_stale_channels($config); # remove orphaned data say $config->{name}, ': removing orphaned entries' if $ENV{DEBUG}; remove_orphaned_entries($config); } sub generate_meta { my ($config) = @_; # generate network meta (all channels) generate_channel_meta($config, undef); generate_hours_meta($config, undef); foreach my $channel ('_empty', sort keys %{ $config->{channels} }) { next if $config->{channels}->{$channel}->{no_logs}; say $config->{name}, ': ', $channel if $ENV{DEBUG}; # generate data for about #channel generate_channel_meta($config, $channel); generate_hours_meta($config, $channel); generate_nicks_meta($config, $channel); } delete $config->{channels}->{_empty}; } sub generate_channel_meta { my ($config, $channel) = @_; my $dbh = dbh($config); my $now = DateTime->now(); my $output_file = file_for($config, 'meta', $channel, 'meta'); my $six_months = $now->clone()->subtract(months => 6); my $one_month = $now->clone()->subtract(months => 1); my $meta = {}; if ($channel) { #<<< $meta->{first_time} = $dbh->selectrow_array( 'SELECT time FROM logs WHERE channel = ? ORDER BY time ASC LIMIT 1', undef, $channel ); $meta->{active_events} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM logs WHERE channel = ? AND time >= ' . $six_months->epoch, undef, $channel ); $meta->{active_nicks} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM (' . 'SELECT DISTINCT nick FROM logs WHERE channel = ? AND time >= ' . $six_months->epoch . ')', undef, $channel ); $meta->{event_count} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM logs WHERE channel = ?', undef, $channel ); #>>> } else { #<<< $meta->{first_time} = $dbh->selectrow_array( 'SELECT time FROM logs ORDER BY time ASC LIMIT 1' ); $meta->{active_events} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM logs WHERE time >= ' . $six_months->epoch ); $meta->{active_nicks} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM (' . 'SELECT DISTINCT nick FROM logs WHERE time >= ' . $six_months->epoch . ')' ); $meta->{event_count} = $dbh->selectrow_array( 'SELECT COUNT(*) FROM logs' ); #>>> } $meta->{active_events_days} = $six_months->delta_days($now)->in_units('days'); # because we calculate the event count nightly for the network-wide stats, # round to nearest 1000 to make it clear it's an approximation if (!$channel) { $meta->{event_count} = round($meta->{event_count} / 1000) * 1000; } say ' ', encode_json($meta) if $ENV{DEBUG}; spurt($output_file, encode_json($meta)); } sub generate_hours_meta { my ($config, $channel) = @_; my $dbh = dbh($config); my $output_file = file_for($config, 'meta', $channel, 'hours'); my $sql = 'SELECT CAST(time % (24 * 3600) AS INT), COUNT(*) FROM logs'; $sql .= ' WHERE channel = ?' if $channel; $sql .= ' GROUP BY CAST(time % (24 * 3600) AS INT)'; my @values; push @values, $channel if $channel; say ' ', replace_sql_placeholders($dbh, $sql, \@values) if $ENV{DEBUG}; my %values; foreach my $row (@{ $dbh->selectall_arrayref($sql, undef, @values) }) { my ($ss, $value) = @{$row}; my $hh = sprintf('%.1f', $ss / (60 * 60)); $hh = '0.0' if $hh == 24; $values{$hh} += $value; } my @data; for (my $hh = 0; $hh < 24; $hh += 0.1) { $hh = sprintf('%.1f', $hh); push @data, [$hh * 1.0, ($values{$hh} || 0) * 1]; } my $json = { data => \@data, lines => { show => \1, fill => \1, }, }; spurt($output_file, encode_json($json)); } sub generate_nicks_meta { my ($config, $channel) = @_; my $dbh = dbh($config); my $output_file = file_for($config, 'meta', $channel, 'nicks'); my $sql = 'SELECT nick, COUNT(*) AS count FROM logs'; $sql .= ' WHERE channel = ?' if $channel; $sql .= ' GROUP BY nick'; $sql .= ' ORDER BY COUNT(*) DESC, nick ASC'; $sql .= ' LIMIT 20'; my @values; push @values, $channel if $channel; say ' ', replace_sql_placeholders($dbh, $sql, \@values) if $ENV{DEBUG}; my $data = $dbh->selectall_arrayref($sql, { Slice => {} }, @values); my $json = []; foreach my $event (@{$data}) { $event->{hash} = nick_hash($event->{nick}); $event->{bot} = nick_is_bot($config, $event->{nick}); push @{$json}, $event; } spurt($output_file, encode_json($json)); } sub archive_stale_channels { my ($config) = @_; my $now = time(); my $reload = 0; foreach my $channel (sort keys %{ $config->{channels} }) { next if $config->{channels}->{$channel}->{disabled}; next if $config->{channels}->{$channel}->{archived}; my $dbh = dbh($config); # some messages are always ignored when determining channel activity my @ignore; # notices from the irc network itself # just stripping ^irc. isn't ideal but works for most networks. (my $network = $config->{irc}->{host}) =~ s/^irc\.//; $network =~ s/:\d+$//; push @ignore, "type = 2 AND nick LIKE '%.$network'"; # ignore our own messages push @ignore, 'nick = ' . $dbh->quote($config->{irc}->{nick}); my $where = 'channel = ' . $dbh->quote($channel) . ' AND NOT ((' . join(') OR (', @ignore) . '))'; # empty channels my $count = $dbh->selectrow_array('SELECT COUNT(*) FROM logs WHERE ' . $where); if ($count < $EMPTY_THRESHOLD) { my $first_time = $dbh->selectrow_array('SELECT time FROM logs WHERE channel = ? ORDER BY time LIMIT 1', undef, $channel); next if $first_time && $now - $first_time <= $DELETE_TIME; my $msg = $config->{name} . ": channel $channel is empty ($count) and stale, deleting"; $log->info($msg); say $msg if $ENV{DEBUG}; $config = reload_config($config); delete $config->{channels}->{$channel}; save_config($config); next; } # stale channels my $last_time = $dbh->selectrow_array('SELECT time FROM logs WHERE ' . $where . ' ORDER BY time DESC LIMIT 1'); next unless $last_time; if ($now - $last_time > $ARCHIVE_TIME) { my $msg = $config->{name} . ": $channel is stale, archiving"; $log->info($msg); say $msg if $ENV{DEBUG}; $config = reload_config($config); $config->{channels}->{$channel}->{archived} = 1; save_config($config); next; } } if ($reload) { system("$RealBin/logbot-irc", $config->{_derived}->{file}, '--reload'); } } sub remove_orphaned_entries { my ($config) = @_; my $dbh = dbh($config, read_write => 1); my @channels = sort keys %{ $config->{channels} }; my $db_channels = $dbh->selectcol_arrayref('SELECT DISTINCT channel FROM logs ORDER BY channel'); my @orphans; foreach my $channel (@{$db_channels}) { next if any { $_ eq $channel } @channels; push @orphans, $channel; } if (@orphans) { say 'deleting orphaned channels: ', join(', ', @orphans) if $ENV{DEBUG}; $dbh->do('DELETE FROM logs WHERE channel in (' . join(',', ('?') x scalar(@orphans)) . ')', undef, @orphans); } $dbh->do('DELETE FROM topics WHERE NOT (channel in (' . join(',', ('?') x scalar(@channels)) . '))', undef, @channels); }