#!/usr/bin/perl
use 5.016;
use strict;
use warnings;

use Digest;
use Encode qw(encode_utf8);
use File::Temp qw(tempfile);
use Getopt::Long;
use List::Util qw(max min uniq);
use POSIX qw(strftime);
use Time::Piece;
use Time::Seconds;

use XML::LibXML;

# Selector kinds. id_selector() turns a configuration value into a
# (text, kind) pair, where kind is one of:
#   SELECTOR_STRING       plain value, no delimiters
#   SELECTOR_SINGLE_TEXT  [xpath]    text of the first matching node
#   SELECTOR_MULTI_TEXT   [[xpath]]  text of every matching node
#   SELECTOR_SINGLE_HTML  <xpath>    serialized first matching node
#   SELECTOR_MULTI_HTML   <<xpath>>  serialized form of every matching node
use constant {
    SELECTOR_STRING      => 0,
    SELECTOR_SINGLE_TEXT => 1,
    SELECTOR_MULTI_TEXT  => 2,
    SELECTOR_SINGLE_HTML => 3,
    SELECTOR_MULTI_HTML  => 4,
};

# Command-line help text.
my $USAGE = <<"HERE";
Usage:
  html2rss.pl [options] config [location]

Options:
  -a          Generate atom feed
  -r          Generate RSS 2.0 feed
  -o <file>   Output feed to <file>
  -h          Print this usage message
HERE

# Time the script started. Time::Piece replaces localtime, so in scalar
# context this is a Time::Piece object (later code calls ->epoch and
# ->truncate on it).
my $NOW = localtime;

# TODO: If title is SELECTOR_STRING, make it unique somehow?
# TODO: Allow combining selector statements? like:
#   SlackBuilds.org: [.//div[@class="whatever"]]

# User-agent string built from the OS name and perl version.
# NOTE(review): presumably handed to curl_to() as user_agent; the call
# site is not visible in this part of the file.
my $DEFAULT_AGENT = "html2rss.pl ($^O; perl $^V)";

# Day number (0 = Sunday) to the day name written into RSS <skipDays>.
my %DAY_MAP = (
    0 => 'Sunday',
    1 => 'Monday',
    2 => 'Tuesday',
    3 => 'Wednesday',
    4 => 'Thursday',
    5 => 'Friday',
    6 => 'Saturday',
);

# Spellings accepted for [Feed].SkipDays (looked up after lower-casing),
# mapped to the day numbers used by %DAY_MAP. Both '0' and '7' mean Sunday.
my %DAY_STRINGS = (
    '0'         => 0,
    '7'         => 0,
    'sun'       => 0,
    'sunday'    => 0,
    '1'         => 1,
    'mon'       => 1,
    'monday'    => 1,
    '2'         => 2,
    'tue'       => 2,
    'tuesday'   => 2,
    '3'         => 3,
    'wed'       => 3,
    'wednesday' => 3,
    '4'         => 4,
    'thu'       => 4,
    'thursday'  => 4,
    '5'         => 5,
    'fri'       => 5,
    'friday'    => 5,
    '6'         => 6,
    'sat'       => 6,
    'saturday'  => 6,
);

# Set of feed formats accepted for [Feed].Format (compared lower-cased).
my %FEED_FORMATS = map { $_ => 1 } qw(
    atom
    rss
);

# Parse an INI-style configuration file.
#
# The file consists of "[Section]" headers followed by "key = value"
# lines. Blank lines and lines whose first non-blank character is '#' are
# ignored. Leading and trailing whitespace (including the "\r" of CRLF
# files) is stripped from every line before it is interpreted. Only the
# first '=' separates key from value, so values may contain '='.
#
# Parameters:
#   $file - path of the configuration file
#
# Returns a hash ref of the form { Section => { key => value, ... }, ... }.
#
# Dies if the file cannot be opened, if a key-value line appears before
# any section header, if a line has no '=', or if a key contains
# characters other than word characters.
sub read_config {

    my ($file) = @_;

    my $config = {};

    open my $fh, '<', $file
        or die "Failed to open $file for reading: $!\n";

    my $section;
    my $ln = 0;
    while (my $line = readline $fh) {
        $ln++;
        chomp $line;
        # Trim before classifying the line so that indentation, trailing
        # blanks and carriage returns cannot turn a comment or a section
        # header into a bogus key-value pair.
        $line =~ s/^\s+|\s+$//g;
        if ($line eq '' or $line =~ /^#/) {
            next;
        }
        if ($line =~ /^\[(\w+)\]$/) {
            $section = $1;
            $config->{ $section } //= {};
            next;
        }
        if (not defined $section) {
            die "$file $ln: key-value pair not under section\n";
        }
        my ($key, $value) = split /\s*=\s*/, $line, 2;
        if ($key !~ /^\w+$/) {
            die "$file $ln: '$key' key contains invalid characters\n";
        }
        # "key =" yields an empty string; only a line without any '='
        # leaves the value undefined.
        if (not defined $value) {
            die "$file $ln: expected 'key = value'\n";
        }
        $config->{ $section }{ $key } = $value;
    }

    close $fh;

    return $config;

}

# Download a URL to a file by running the external curl program.
#
# Parameters:
#   $url   - location to fetch
#   $to    - path of the file curl writes the response body to
#   %param - optional settings:
#              user_agent - value passed to curl's -A option
#
# Returns 1 on success. Dies if curl could not be started, was killed by
# a signal, or exited with a non-zero status.
sub curl_to {

    my ($url, $to, %param) = @_;
    my $agent = $param{ user_agent };

    my @args;

    if (defined $agent) {
        push @args, '-A', $agent;
    }

    push @args, '-s', '--show-error';
    push @args, '-o', $to;
    push @args, $url;

    # List form of system: no shell is involved, arguments are passed
    # to curl verbatim.
    system 'curl', @args;

    # $? is -1 when the program could not be run, holds the terminating
    # signal in its low bits and the exit status in its high byte. Any
    # non-zero value therefore means the download did not complete;
    # looking only at the exit status would miss death by signal.
    if ($? != 0) {
        die "Failed to curl '$url' to $to\n";
    }

    return 1;

}

# Classify a configuration value by its surrounding delimiters.
#
# Returns a two-element list (text, kind): the value with its delimiters
# removed and the matching SELECTOR_* constant. A value without any of
# the recognised delimiters is returned unchanged as SELECTOR_STRING.
sub id_selector {

    my ($str) = @_;

    # Tried in order: the doubled delimiters must come before the single
    # ones, which would otherwise match them as well.
    my @forms = (
        [ qr/^\[\[(.*)\]\]$/, SELECTOR_MULTI_TEXT  ],
        [ qr/^<<(.*)>>$/,     SELECTOR_MULTI_HTML  ],
        [ qr/^\[(.*)\]$/,     SELECTOR_SINGLE_TEXT ],
        [ qr/^<(.*)>$/,       SELECTOR_SINGLE_HTML ],
    );

    for my $form (@forms) {
        my ($pattern, $kind) = @{ $form };
        if (my ($inner) = $str =~ $pattern) {
            return ($inner, $kind);
        }
    }

    return ($str, SELECTOR_STRING);

}

# Test whether a selector is a plain string equal to $string.
#
# $selector is a [ text, kind ] array ref. Returns 0 when the selector is
# not of kind SELECTOR_STRING, otherwise the result of the string
# comparison.
sub selector_str_eq {

    my ($selector, $string) = @_;

    my $is_literal = $selector->[1] == SELECTOR_STRING;

    return $is_literal ? $selector->[0] eq $string : 0;

}

# Round a time down to the start of its week.
#
# Parameters:
#   $time - Time::Piece object
#
# Returns a new Time::Piece object set to midnight of the most recent
# day whose wday is 1 (Sunday in Time::Piece's numbering), on or before
# $time.
sub truncate_week {

    my ($time) = @_;

    my $day = $time->truncate(to => 'day');

    while ($day->wday != 1) {
        # Step back from midnight by half a day and snap to midnight
        # again. Subtracting a full 24 hours is not safe for local times:
        # across a daylight-saving change a calendar day is not 24 hours
        # long, so the result could land off midnight or skip a whole day.
        $day = ($day - 12 * ONE_HOUR)->truncate(to => 'day');
    }

    return $day;

}

# Render an epoch time as the UTC timestamp used in Atom feeds,
# e.g. "2024-01-10T12:34:56Z".
sub format_atom_time {

    my ($time) = @_;

    # List context: the broken-down UTC fields, as strftime expects them.
    my @utc_fields = gmtime $time;

    my $stamp = strftime('%Y-%m-%dT%H:%M:%SZ', @utc_fields);

    return $stamp;

}

# Render an epoch time as the UTC date string used in RSS feeds,
# e.g. "Wed, 10 Jan 2024 12:34:56 -0000".
#
# Day and month names are taken from fixed English tables rather than
# from strftime's %a and %b, whose output follows the process's LC_TIME
# locale and would produce dates feed readers cannot parse.
sub format_rss_time {

    my ($time) = @_;

    my @day_names   = qw(Sun Mon Tue Wed Thu Fri Sat);
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # List context: the broken-down UTC fields.
    my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime $time;

    return sprintf(
        '%s, %02d %s %04d %02d:%02d:%02d -0000',
        $day_names[$wday], $mday, $month_names[$mon], $year + 1900,
        $hour, $min, $sec,
    );

}

# Derive an identifier for a feed item.
#
# The identifier is the hexadecimal SHA-256 digest of the optional salt,
# the item's title and the item's link, each encoded as UTF-8 and fed to
# the digest in that order.
#
# Parameters:
#   $item - hash ref with defined 'title' and 'link' entries
#   $salt - optional string mixed in ahead of the item data
#
# Dies if the title or the link is undefined.
sub generate_item_id {

    my ($item, $salt) = @_;

    my $hasher = Digest->new('SHA-256');
    $hasher->add(encode_utf8($salt)) if defined $salt;

    my @required = (
        [ title => '$item missing title' ],
        [ link  => '$item missing link'  ],
    );
    for my $pair (@required) {
        my ($field, $complaint) = @{ $pair };
        die $complaint if not defined $item->{ $field };
    }

    $hasher->add(map { encode_utf8($item->{ $_ }) } qw(title link));

    return $hasher->hexdigest;

}

# Derive an identifier for a feed.
#
# The identifier is the hexadecimal SHA-256 digest of the optional salt,
# the feed's title and the feed's link, each encoded as UTF-8 and fed to
# the digest in that order.
#
# Parameters:
#   $feed - hash ref with defined 'title' and 'link' entries
#   $salt - optional string mixed in ahead of the feed data
#
# Dies if the title or the link is undefined.
sub generate_feed_id {

    my ($feed, $salt) = @_;

    my $hasher = Digest->new('SHA-256');
    $hasher->add(encode_utf8($salt)) if defined $salt;

    my @required = (
        [ title => '$feed missing title' ],
        [ link  => '$feed missing link'  ],
    );
    for my $pair (@required) {
        my ($field, $complaint) = @{ $pair };
        die $complaint if not defined $feed->{ $field };
    }

    $hasher->add(map { encode_utf8($feed->{ $_ }) } qw(title link));

    return $hasher->hexdigest;

}

# Build the Atom <entry> element for one feed item.
#
# Parameters:
#   $item - hash ref. 'title' and 'link' are required. Optional entries:
#           'summary' (with 'summary_html' true when it holds HTML),
#           'author', 'updated', 'published' (epoch times),
#           'categories' (array ref) and 'rights'.
#
# Returns the XML::LibXML::Element for the entry. Children are emitted in
# the order title, link, id, summary, author, updated, category, rights.
# A <summary> is always written, empty when the item has none.
#
# Dies if $item is not a hash ref or lacks a title or link.
sub generate_atom_item {

    my ($item) = @_;

    if (ref $item ne 'HASH') {
        die '$item is not a hash ref';
    }

    # Append a new, empty child element and return it.
    my $add_child = sub {
        my ($parent, $tag) = @_;
        return $parent->appendChild(XML::LibXML::Element->new($tag));
    };

    # Append a new child element holding a single text node.
    my $add_text_child = sub {
        my ($parent, $tag, $text) = @_;
        my $child = $add_child->($parent, $tag);
        $child->appendChild(XML::LibXML::Text->new($text));
        return $child;
    };

    my $entry = XML::LibXML::Element->new('entry');

    die "item missing title\n" if not defined $item->{ title };
    $add_text_child->($entry, 'title', $item->{ title });

    die "item missing link\n" if not defined $item->{ link };
    $add_child->($entry, 'link')->setAttribute('href', $item->{ link });

    $add_text_child->($entry, 'id', generate_item_id($item));

    # The summary type is only 'html' for an existing summary flagged as
    # HTML; a missing summary becomes an empty text summary.
    my $has_summary = defined $item->{ summary };
    my $summary = $add_child->($entry, 'summary');
    $summary->setAttribute(
        'type',
        $has_summary && $item->{ summary_html } ? 'html' : 'text'
    );
    $summary->appendChild(
        XML::LibXML::Text->new($has_summary ? $item->{ summary } : '')
    );

    if (defined $item->{ author }) {
        my $author = $add_child->($entry, 'author');
        $add_text_child->($author, 'name', $item->{ author });
    }

    # TODO: Should this be mandatory?
    my $stamp = $item->{ updated } // $item->{ published };
    if (defined $stamp) {
        $add_text_child->($entry, 'updated', format_atom_time($stamp));
    }

    for my $term (@{ $item->{ categories } // [] }) {
        $add_child->($entry, 'category')->setAttribute('term', $term);
    }

    if (defined $item->{ rights }) {
        $add_text_child->($entry, 'rights', $item->{ rights });
    }

    return $entry;

}

# Build the RSS 2.0 <item> element for one feed item.
#
# Parameters:
#   $item - hash ref. 'title' and 'link' are required. Optional entries:
#           'published', 'updated' (epoch times), 'author', 'summary'
#           and 'categories' (array ref).
#
# Returns the XML::LibXML::Element for the item. Children are emitted in
# the order title, link, guid, pubDate, author, description, category.
#
# Dies if $item is not a hash ref or lacks a title or link.
sub generate_rss_item {

    my ($item) = @_;

    if (ref $item ne 'HASH') {
        die '$item is not a hash ref';
    }

    my $node = XML::LibXML::Element->new('item');

    if (not defined $item->{ title }) {
        die "item missing title\n";
    }
    if (not defined $item->{ link }) {
        die "item missing link\n";
    }

    my $title = $node->appendChild(
        XML::LibXML::Element->new('title')
    );
    $title->appendChild(
        XML::LibXML::Text->new($item->{ title })
    );

    my $link = $node->appendChild(
        XML::LibXML::Element->new('link')
    );
    $link->appendChild(
        XML::LibXML::Text->new($item->{ link })
    );

    my $id = $node->appendChild(
        XML::LibXML::Element->new('guid')
    );
    # The guid is a digest, not a URL. RSS readers treat a guid as a
    # permalink unless told otherwise, so say so explicitly.
    $id->setAttribute('isPermaLink', 'false');
    $id->appendChild(
        XML::LibXML::Text->new(generate_item_id($item))
    );

    # pubDate prefers the publication time and falls back to the update
    # time.
    if (defined $item->{ published } or defined $item->{ updated }) {
        my $pub = $node->appendChild(
            XML::LibXML::Element->new('pubDate')
        );
        $pub->appendChild(
            XML::LibXML::Text->new(
                format_rss_time($item->{ published } // $item->{ updated })
            )
        );
    }

    if (defined $item->{ author }) {
        my $author = $node->appendChild(
            XML::LibXML::Element->new('author')
        );
        $author->appendChild(
            XML::LibXML::Text->new($item->{ author })
        );
    }

    if (defined $item->{ summary }) {
        my $desc = $node->appendChild(
            XML::LibXML::Element->new('description')
        );
        $desc->appendChild(
            XML::LibXML::Text->new($item->{ summary })
        );
    }

    if (defined $item->{ categories }) {
        for my $c (@{ $item->{ categories } }) {
            my $cat = $node->appendChild(
                XML::LibXML::Element->new('category')
            );
            $cat->appendChild(
                XML::LibXML::Text->new($c)
            );
        }
    }

    return $node;

}

# Build a complete Atom document for a feed.
#
# Parameters:
#   $feed - hash ref. 'title' and 'link' are required. Optional entries:
#           'updated', 'published' (epoch times), 'description',
#           'rights', 'categories' (array ref), 'generator' and 'items'
#           (array ref of item hash refs for generate_atom_item).
#
# Returns the XML::LibXML::Document whose root is the <feed> element.
# Children are emitted in the order title, link, id, updated, subtitle,
# rights, category, generator, then one entry per item.
#
# Dies if the feed lacks a title or link.
sub generate_atom_feed {

    my ($feed) = @_;

    # Append a new, empty child element and return it.
    my $add_child = sub {
        my ($parent, $tag) = @_;
        return $parent->appendChild(XML::LibXML::Element->new($tag));
    };

    # Append a new child element holding a single text node.
    my $add_text_child = sub {
        my ($parent, $tag, $text) = @_;
        my $child = $add_child->($parent, $tag);
        $child->appendChild(XML::LibXML::Text->new($text));
        return $child;
    };

    my $doc = XML::LibXML::Document->new();
    my $root = XML::LibXML::Element->new('feed');
    $root->setAttribute('xmlns', 'http://www.w3.org/2005/Atom');
    $doc->setDocumentElement($root);

    die "feed is missing title\n" if not defined $feed->{ title };
    die "feed is missing link\n" if not defined $feed->{ link };

    $add_text_child->($root, 'title', $feed->{ title });
    $add_child->($root, 'link')->setAttribute('href', $feed->{ link });
    $add_text_child->($root, 'id', generate_feed_id($feed));

    # TODO: Updated/published should be mandatory
    my $stamp = $feed->{ updated } // $feed->{ published };
    $add_text_child->($root, 'updated', format_atom_time($stamp));

    # Feed field => Atom element, for the optional plain-text children
    # that come before the categories.
    my @leading = (
        [ description => 'subtitle' ],
        [ rights      => 'rights'   ],
    );
    for my $pair (@leading) {
        my ($field, $tag) = @{ $pair };
        next if not defined $feed->{ $field };
        $add_text_child->($root, $tag, $feed->{ $field });
    }

    for my $term (@{ $feed->{ categories } // [] }) {
        $add_child->($root, 'category')->setAttribute('term', $term);
    }

    if (defined $feed->{ generator }) {
        $add_text_child->($root, 'generator', $feed->{ generator });
    }

    for my $entry (@{ $feed->{ items } // [] }) {
        $root->appendChild(generate_atom_item($entry));
    }

    return $doc;

}

# Build a complete RSS 2.0 document for a feed.
#
# Parameters:
#   $feed - hash ref. 'title' and 'link' are required. Optional entries:
#           'description', 'language', 'rights', 'published', 'updated'
#           (epoch times), 'categories' (array ref), 'generator', 'ttl'
#           (non-negative integer), 'skip_hours' (array ref of integers
#           0..23), 'skip_days' (array ref of %DAY_MAP keys) and 'items'
#           (array ref of item hash refs for generate_rss_item).
#
# Returns the XML::LibXML::Document whose root is <rss version="2.0">
# containing a single <channel>. A <description> is always written, empty
# when the feed has none.
#
# Dies if the feed lacks a title or link, or if ttl, skip_hours or
# skip_days hold invalid values.
sub generate_rss_feed {

    my ($feed) = @_;

    my $doc = XML::LibXML::Document->new;
    my $node = XML::LibXML::Element->new('rss');
    $node->setAttribute('version', '2.0');
    $doc->setDocumentElement($node);
    my $channel = $node->appendChild(
        XML::LibXML::Element->new('channel')
    );

    if (not defined $feed->{ title }) {
        die "feed is missing title\n";
    }
    if (not defined $feed->{ link }) {
        die "feed is missing link\n";
    }

    # Append <$tag>$text</$tag> to the channel.
    my $add_text = sub {
        my ($parent, $tag, $text) = @_;
        my $child = $parent->appendChild(
            XML::LibXML::Element->new($tag)
        );
        $child->appendChild(
            XML::LibXML::Text->new($text)
        );
        return $child;
    };

    $add_text->($channel, 'title', $feed->{ title });
    $add_text->($channel, 'link', $feed->{ link });
    $add_text->($channel, 'description', $feed->{ description } // '');

    if (defined $feed->{ language }) {
        $add_text->($channel, 'language', $feed->{ language });
    }

    if (defined $feed->{ rights }) {
        $add_text->($channel, 'copyright', $feed->{ rights });
    }

    if (defined $feed->{ published }) {
        $add_text->(
            $channel, 'pubDate', format_rss_time($feed->{ published })
        );
    }

    if (defined $feed->{ updated }) {
        $add_text->(
            $channel, 'lastBuildDate', format_rss_time($feed->{ updated })
        );
    }

    if (defined $feed->{ categories }) {
        for my $c (@{ $feed->{ categories } }) {
            $add_text->($channel, 'category', $c);
        }
    }

    if (defined $feed->{ generator }) {
        $add_text->($channel, 'generator', $feed->{ generator });
    }

    if (defined $feed->{ ttl }) {
        if ($feed->{ ttl } !~ /^\d+$/) {
            die "channel ttl is not an integer\n";
        }
        $add_text->($channel, 'ttl', $feed->{ ttl });
    }

    if (defined $feed->{ skip_hours }) {
        my @hours = @{ $feed->{ skip_hours } };
        # Validate before sorting: a numeric sort of non-numeric data
        # would only produce warnings and an arbitrary order.
        for my $h (@hours) {
            if ($h !~ /^\d+$/ or $h > 23) {
                die "skip_hours contains invalid value ($h)\n";
            }
        }
        my $skip = $channel->appendChild(
            XML::LibXML::Element->new('skipHours')
        );
        # Normalize to plain numbers so that e.g. "07" and "7" count as
        # the same hour when removing duplicates.
        for my $h (uniq sort { $a <=> $b } map { $_ + 0 } @hours) {
            $add_text->($skip, 'hour', $h);
        }
    }

    if (defined $feed->{ skip_days }) {
        my @days = @{ $feed->{ skip_days } };
        for my $d (@days) {
            if (not exists $DAY_MAP{ $d }) {
                die "skip_days contains invalid value ($d)\n";
            }
        }
        my $skip = $channel->appendChild(
            XML::LibXML::Element->new('skipDays')
        );
        for my $d (uniq sort { $a <=> $b } @days) {
            $add_text->($skip, 'day', $DAY_MAP{ $d });
        }
    }

    if (defined $feed->{ items }) {
        for my $i (@{ $feed->{ items } }) {
            my $item = generate_rss_item($i);
            $channel->appendChild($item);
        }
    }

    return $doc;

}


# Read a configuration file and turn it into the structure used to build
# a feed.
#
# Parameters:
#   $file - path of the configuration file (parsed by read_config)
#
# Returns a hash ref with 'Feed' and 'Item' sections. Every known key is
# present; keys absent from the file are undef, except [Feed].Format,
# which defaults to 'atom'. Values are stored as follows:
#   - [Feed].Link and [Item].Select: the text part of the selector only
#   - TTL: the integer as written
#   - SkipHours: array ref of hours; SkipDays: array ref of day numbers
#   - Format: lower-cased format name
#   - PublishedFmt / UpdatedFmt: the raw string
#   - all other keys: [ text, kind ] pairs as returned by id_selector
#
# Dies if a required section or key is missing, if TTL, SkipHours,
# SkipDays or Format hold invalid values, or if [Feed].Published and
# [Feed].Updated refer to each other or to themselves.
sub process_config {

    my ($file) = @_;

    my $config = {
        Feed => {
            Title        => undef,
            Link         => undef,
            Description  => undef,
            Language     => undef,
            Rights       => undef,
            Published    => undef,
            Updated      => undef,
            Categories   => undef,
            Generator    => undef,
            TTL          => undef,
            SkipHours    => undef,
            SkipDays     => undef,
            Format       => undef,
            PublishedFmt => undef,
            UpdatedFmt   => undef,
        },
        Item => {
            Select       => undef,
            Title        => undef,
            Link         => undef,
            Published    => undef,
            Updated      => undef,
            Author       => undef,
            Content      => undef,
            Categories   => undef,
            Rights       => undef,
            PublishedFmt => undef,
            UpdatedFmt   => undef,
        },
    };

    my $hash = read_config($file);

    for my $section (qw(Feed Item)) {
        if (not exists $hash->{ $section }) {
            die "$file missing [$section] section\n";
        }
    }

    # TODO: Are there sane defaults we can default to for these?
    my @required = (
        [ Feed => 'Title'  ],
        [ Feed => 'Link'   ],
        [ Item => 'Select' ],
        [ Item => 'Title'  ],
        [ Item => 'Link'   ],
    );
    for my $pair (@required) {
        my ($section, $key) = @{ $pair };
        if (not exists $hash->{ $section }{ $key }) {
            die "$file missing [$section].$key configuration\n";
        }
    }

    $config->{ Feed }{ Title }  = [ id_selector($hash->{ Feed }{ Title }) ];
    $config->{ Feed }{ Link }   = (id_selector($hash->{ Feed }{ Link }))[0];
    $config->{ Item }{ Select } = (id_selector($hash->{ Item }{ Select }))[0];
    $config->{ Item }{ Title }  = [ id_selector($hash->{ Item }{ Title }) ];
    $config->{ Item }{ Link }   = [ id_selector($hash->{ Item }{ Link }) ];

    # Optional keys whose values are stored as [ text, kind ] selectors.
    my %selector_keys = (
        Feed => [ qw(
            Description Language Rights Published Updated Categories
            Generator
        ) ],
        Item => [ qw(
            Published Updated Author Content Categories Rights
        ) ],
    );

    # Optional keys whose values are copied verbatim.
    my @verbatim_keys = qw(PublishedFmt UpdatedFmt);

    for my $section (qw(Feed Item)) {
        for my $key (@{ $selector_keys{ $section } }) {
            next if not exists $hash->{ $section }{ $key };
            $config->{ $section }{ $key } = [
                id_selector($hash->{ $section }{ $key })
            ];
        }
        for my $key (@verbatim_keys) {
            next if not exists $hash->{ $section }{ $key };
            $config->{ $section }{ $key } = $hash->{ $section }{ $key };
        }
    }

    if (exists $hash->{ Feed }{ TTL }) {
        if ($hash->{ Feed }{ TTL } !~ /^\d+$/) {
            die "TTL must be an integer\n";
        }
        $config->{ Feed }{ TTL } = $hash->{ Feed }{ TTL };
    }

    if (exists $hash->{ Feed }{ SkipHours }) {
        my @vals = split /\s*,\s*/, $hash->{ Feed }{ SkipHours };
        for my $v (@vals) {
            if ($v !~ /^\d+$/ or ($v < 0 or $v > 23)) {
                die "SkipHours can only contain integers between 0 and 23\n";
            }
            push @{ $config->{ Feed }{ SkipHours } }, $v;
        }
    }

    if (exists $hash->{ Feed }{ SkipDays }) {
        my @vals = split /\s*,\s*/, $hash->{ Feed }{ SkipDays };
        for my $v (@vals) {
            my $vv = lc $v;
            if (not exists $DAY_STRINGS{ $vv }) {
                die "SkipDays contains invalid value ($v)\n";
            }
            push @{ $config->{ Feed }{ SkipDays } }, $DAY_STRINGS{ $vv };
        }
    }

    if (exists $hash->{ Feed }{ Format }) {
        $config->{ Feed }{ Format } = lc $hash->{ Feed }{ Format };
        if (not $FEED_FORMATS{ $config->{ Feed }{ Format } }) {
            die "'$hash->{ Feed }{ Format }' is not a valid feed format\n";
        }
    } else {
        $config->{ Feed }{ Format } = 'atom';
    }

    # The plain strings 'updated' and 'published' are keywords that make
    # one feed time copy the other. Reject references to a missing time,
    # to itself, or in a circle. Only plain strings count: an XPath
    # selector that happens to read 'published' is not a keyword.
    my $published = $config->{ Feed }{ Published };
    my $updated   = $config->{ Feed }{ Updated };

    if (defined $published and selector_str_eq($published, 'updated')) {
        if (not defined $updated) {
            die "published set to 'updated', but 'updated' not defined\n";
        }
        if (selector_str_eq($updated, 'published')) {
            die "published cannot be set to 'updated' if updated is set to 'published'\n";
        }
    }

    if (defined $published and selector_str_eq($published, 'published')) {
        die "published cannot be set to 'published'\n";
    }

    if (defined $updated and selector_str_eq($updated, 'updated')) {
        die "updated cannot be set to 'updated'\n";
    }

    return $config;

}

# Evaluate a selector against a node and return a single string.
#
# Parameters:
#   $node     - XML::LibXML node the XPath expression is evaluated on
#   $selector - [ text, kind ] array ref as produced by id_selector
#
# Returns, depending on the selector kind:
#   SELECTOR_STRING       the text itself; $node is not consulted
#   SELECTOR_SINGLE_TEXT  text content (or attribute value) of the first
#                         match
#   SELECTOR_MULTI_TEXT   the same for every match, concatenated
#   SELECTOR_SINGLE_HTML  serialized form of the first match
#   SELECTOR_MULTI_HTML   serialized form of every match, concatenated
#
# Dies if nothing matches the XPath expression, if an HTML selector
# matches an attribute node, or if the selector kind is unknown.
sub xpath_select {

    my ($node, $selector) = @_;

    my $xpath = $selector->[0];
    my $kind  = $selector->[1];

    if ($kind == SELECTOR_STRING) {
        return $xpath;
    }

    my $is_text = (
        $kind == SELECTOR_SINGLE_TEXT or $kind == SELECTOR_MULTI_TEXT
    );
    my $is_html = (
        $kind == SELECTOR_SINGLE_HTML or $kind == SELECTOR_MULTI_HTML
    );
    my $is_multi = (
        $kind == SELECTOR_MULTI_TEXT or $kind == SELECTOR_MULTI_HTML
    );

    if (not $is_text and not $is_html) {
        die "Invalid selector type";
    }

    # Always query with the XPath text (element 0), never with the kind.
    my @found = $node->findnodes($xpath);
    if (!@found) {
        die "Nothing matches '$xpath'\n";
    }

    # Single selectors only look at the first match.
    if (not $is_multi) {
        @found = ($found[0]);
    }

    my $text = '';
    for my $n (@found) {
        if ($n->isa('XML::LibXML::Attr')) {
            if ($is_html) {
                die "Cannot select the HTML of an attribute node\n";
            }
            $text .= $n->value // '';
        } elsif ($is_html) {
            $text .= $n->toString;
        } else {
            $text .= $n->textContent;
        }
    }

    return $text;

}

# Evaluate a selector against a node and return one string per match.
#
# Parameters:
#   $dom      - XML::LibXML node the XPath expression is evaluated on
#   $selector - [ text, kind ] array ref as produced by id_selector
#
# Returns a list:
#   SELECTOR_STRING  a single element, the text itself
#   text selectors   the text content (or attribute value) of each match
#   HTML selectors   the serialized form of each match; attribute nodes
#                    are left out
# Single and multi selectors behave alike here. The list is empty when
# nothing matches.
#
# Dies if the selector kind is unknown.
sub xpath_multi_select {

    my ($dom, $selector) = @_;

    my $kind = $selector->[1];

    if ($kind == SELECTOR_STRING) {
        return ($selector->[0]);
    }

    if ($kind == SELECTOR_SINGLE_TEXT or $kind == SELECTOR_MULTI_TEXT) {
        my @found = $dom->findnodes($selector->[0]);
        return map {
            $_->isa('XML::LibXML::Attr') ? $_->value // '' : $_->textContent
        } @found;
    }

    if ($kind == SELECTOR_SINGLE_HTML or $kind == SELECTOR_MULTI_HTML) {
        # Query with the XPath text (element 0), not with the kind.
        my @found = $dom->findnodes($selector->[0]);
        @found = grep { !$_->isa('XML::LibXML::Attr') } @found;
        return map { $_->toString } @found;
    }

    die "Invalid selector type";

}

sub html2feed {

    my ($html, $config) = @_;

    my $feed = {
        title       => undef,
        link        => undef,
        updated     => undef,
        published   => undef,
        description => undef,
        language    => undef,
        rights      => undef,
        categories  => undef,
        generator   => undef,
        ttl         => undef,
        skip_hours  => undef,
        skip_days   => undef,
        items       => [],
    };

    my $dom = XML::LibXML->load_html(
        location => $html,
        recover => 2,
    );

    $feed->{ title } = xpath_select($dom, $config->{ Feed }{ Title });
    $feed->{ link } = $config->{ Feed }{ Link };

    if (defined $config->{ Feed }{ Published }) {
        if ($config->{ Feed }{ Published }[1] == SELECTOR_STRING) {
            my $p = $config->{ Feed }{ Published }[0];
            if ($p eq 'now') {
                $feed->{ published } = $NOW->epoch;
            } elsif ($p eq 'today') {
                $feed->{ published } = $NOW->truncate(to => 'day')->epoch;
            } elsif ($p eq 'week') {
                $feed->{ published } = truncate_week($NOW)->epoch;
            } elsif ($p eq 'month') {
                $feed->{ published } = $NOW->truncate(to => 'month')->epoch;
            } elsif ($p eq 'year') {
                $feed->{ published } = $NOW->truncate(to => 'year')->epoch;
            } elsif ($p eq 'updated') {
                if (not defined $config->{ Feed }{ Updated }) {
                    die "Cannot set published to 'updated': [Feed].Updated not configured\n";
                }
                # Set this later...
            } elsif ($p eq 'published') {
                die "Cannot set published to 'published'\n";
            } elsif (defined $config->{ Feed }{ PublishedFmt }) {
                my $t = Time::Piece->strptime($p, $config->{ Feed }{ PublishedFmt });
                $feed->{ published } = $t->epoch;
            } else {
                die "Cannot set [Feed].Published to a string without setting PublishedFmt\n";
            }
        } else {
            if (not defined $config->{ Feed }{ PublishedFmt }) {
                die "Cannot determine publish time; [Feed].PublishedFmt not configured\n";
            }
            my $sel = xpath_select($dom, $config->{ Feed }{ Published });
            my $t = Time::Piece->strptime($sel, $config->{ Feed }{ PublishedFmt });
            $feed->{ published } = $t->epoch;
        }
    }

    if (defined $config->{ Feed }{ Updated }) {
        if ($config->{ Feed }{ Updated }[1] == SELECTOR_STRING) {
            my $p = $config->{ Feed }{ Updated }[0];
            if ($p eq 'now') {
                $feed->{ updated } = $NOW->epoch;
            } elsif ($p eq 'today') {
                $feed->{ updated } = $NOW->truncate(to => 'day')->epoch;
            } elsif ($p eq 'week') {
                $feed->{ updated } = truncate_week($NOW)->epoch;
            } elsif ($p eq 'month') {
                $feed->{ updated } = $NOW->truncate(to => 'month')->epoch;
            } elsif ($p eq 'year') {
                $feed->{ updated } = $NOW->truncate(to => 'year')->epoch;
            } elsif ($p eq 'updated') {
                die "Cannot set updated to 'updated'\n";
            } elsif ($p eq 'published') {
                if (not defined $feed->{ published }) {
                    die "Cannot set updated to 'published': published is not set\n";
                }
                $feed->{ updated } = $feed->{ published };
            } elsif (defined $config->{ Feed }{ UpdatedFmt }) {
                my $t = Time::Piece->strptime($p, $config->{ Feed }{ UpdatedFmt });
                $feed->{ updated } = $t->epoch;
            } else {
                die "Cannot set [Feed].Updated to a string without setting UpdatedFmt\n";
            }
        } else {
            if (not defined $config->{ Feed }{ UpdatedFmt }) {
                die "Cannot determine publish time; [Feed].UpdatedFmt not configured\n";
            }
            my $sel = xpath_select($dom, $config->{ Feed }{ Updated });
            my $t = Time::Piece->strptime($sel, $config->{ Feed }{ UpdatedFmt });
            $feed->{ updated } = $t->epoch;
        }
    }

    if (
        defined $config->{ Feed }{ Published } and
        $config->{ Feed }{ Published }[1] == SELECTOR_STRING and
        $config->{ Feed }{ Published }[0] eq 'updated'
    ) {
        $feed->{ published } = $feed->{ updated };
    }

    if (defined $config->{ Feed }{ Description }) {
        $feed->{ description } = xpath_select($dom, $config->{ Feed }{ Description });
    }

    if (defined $config->{ Feed }{ Language }) {
        $feed->{ language } = xpath_select($dom, $config->{ Feed }{ Language });
    }

    if (defined $config->{ Feed }{ Rights }) {
        $feed->{ Rights } = xpath_select($dom, $config->{ Feed }{ Rights });
    }

    if (defined $config->{ Feed }{ Categories }) {
        if ($config->{ Feed }{ Categories }[1] == SELECTOR_STRING) {
            my @cats = split /\s*,\s*/, $config->{ Feed }{ Categories }[0];
            $feed->{ categories } = \@cats;
        } else {
            $feed->{ categories } = [
                xpath_multi_select($dom, $config->{ Feed }{ Categories })
            ];
        }
    }

    if (defined $config->{ Feed }{ Generator }) {
        $feed->{ generator } = xpath_select($dom, $config->{ Feed }{ Generator });
    }

    if (defined $config->{ Feed }{ TTL }) {
        $feed->{ ttl } = $config->{ Feed }{ TTL };
    }

    if (defined $config->{ Feed }{ SkipHours }) {
        $feed->{ skip_hours } = $config->{ Feed }{ SkipHours };
    }

    if (defined $config->{ Feed }{ SkipDays }) {
        $feed->{ skip_days } = $config->{ Feed }{ SkipDays };
    }

    my @found = $dom->findnodes($config->{ Item }{ Select });
    for my $n (@found) {
        my $item = {
            title        => undef,
            link         => undef,
            published    => undef,
            updated      => undef,
            author       => undef,
            summary      => undef,
            summary_html => 0,
            categories   => undef,
            rights       => undef,
        };
        $item->{ title } = xpath_select($n, $config->{ Item }{ Title });
        $item->{ link } = xpath_select($n, $config->{ Item }{ Link });
        if (defined $config->{ Item }{ Published }) {
            if ($config->{ Item }{ Published }[1] == SELECTOR_STRING) {
                my $p = $config->{ Item }{ Published }[0];
                if ($p eq 'now') {
                    $item->{ published } = $NOW->epoch;
                } elsif ($p eq 'today') {
                    $item->{ published } = $NOW->truncate(to => 'day')->epoch;
                } elsif ($p eq 'week') {
                    $item->{ published } = truncate_week($NOW)->epoch;
                } elsif ($p eq 'month') {
                    $item->{ published } = $NOW->truncate(to => 'month')->epoch;
                } elsif ($p eq 'year') {
                    $item->{ published } = $NOW->truncate(to => 'year')->epoch;
                } elsif ($p eq 'updated') {
                    if (not defined $config->{ Item }{ Updated }) {
                        die "Cannot set published to 'updated': [Item].Updated not set\n";
                    }
                    # Set this later...
                } elsif ($p eq 'published') {
                    die "Cannot set published to 'published'\n";
                } elsif (defined $config->{ Item }{ PublishedFmt }) {
                    my $t = Time::Piece->strptime($p, $config->{ Item }{ PublishedFmt });
                    $item->{ published } = $t->epoch;
                } else {
                    die "Cannot set [Item].Published to string without setting PublishedFmt\n";
                }
            } else {
                if (not defined $config->{ Item }{ PublishedFmt }) {
                    die "Cannot determine publish time; [Item].PublishedFmt not configured\n";
                }
                my $sel = xpath_select($dom, $config->{ Item }{ Published });
                my $t = Time::Piece->strptime($sel, $config->{ Item }{ PublishedFmt });
                $item->{ published } = $t->epoch;
            }
        }
        if (defined $config->{ Item }{ Updated }) {
            if ($config->{ Item }{ Updated }[1] == SELECTOR_STRING) {
                my $p = $config->{ Item }{ Updated }[0];
                if ($p eq 'now') {
                    $item->{ updated } = $NOW->epoch;
                } elsif ($p eq 'today') {
                    $item->{ updated } = $NOW->truncate(to => 'day')->epoch;
                } elsif ($p eq 'week') {
                    $item->{ updated } = truncate_week($NOW)->epoch;
                } elsif ($p eq 'month') {
                    $item->{ updated } = $NOW->truncate(to => 'month')->epoch;
                } elsif ($p eq 'year') {
                    $item->{ updated } = $NOW->truncate(to => 'year')->epoch;
                } elsif ($p eq 'updated') {
                    die "Cannot set updated to 'updated'\n";
                } elsif ($p eq 'published') {
                    if (not defined $item->{ published }) {
                        die "Cannot set updated to 'published': [Item].published is not set\n";
                    }
                    $item->{ updated } = $item->{ published };
                } elsif (defined $config->{ Item }{ UpdatedFmt }) {
                    my $t = Time::Piece->strptime($p, $config->{ Item }{ UpdatedFmt });
                    $item->{ updated } = $t->epoch;
                } else {
                    die "Cannot set [Item].Updated to string without setting UpdatedFmt\n";
                }
            } else {
                if (not defined $config->{ Item }{ UpdatedFmt }) {
                    die "Cannot determine publish time; [Item].UpdatedFmt not configured\n";
                }
                my $sel = xpath_select($dom, $config->{ Item }{ Updated });
                my $t = Time::Piece->strptime($sel, $config->{ Item }{ UpdatedFmt });
                $item->{ updated } = $t->epoch;
            }
        }
        if (
            defined $config->{ Item }{ Published } and
            $config->{ Item }{ Published }[1] == SELECTOR_STRING and
            $config->{ Item }{ Published }[0] eq 'updated'
        ) {
            $item->{ published } = $item->{ updated };
        }
        if (defined $config->{ Item }{ Author }) {
            $item->{ author } = xpath_select($n, $config->{ Item }{ Author });
        }
        if (defined $config->{ Item }{ Content }) {
            $item->{ summary } = xpath_select($n, $config->{ Item }{ Content });
            if (
                $config->{ Item }{ Content }->[1] == SELECTOR_SINGLE_HTML or
                $config->{ Item }{ Content } == SELECTOR_MULTI_HTML
            ) {
                $item->{ summary_html } = 1;
            }
        }
        if (defined $config->{ Item }{ Categories }) {
            if ($config->{ Item }{ Categories }[1] == SELECTOR_STRING) {
                my @cats = split /\s*,\s*/, $config->{ Item }{ Categories }[0];
                $item->{ categories } = \@cats;
            } else {
                $item->{ categories } = [
                    xpath_multi_select($n, $config->{ Item }{ Categories })
                ];
            }
        }
        if (defined $config->{ Item }{ Rights }) {
            $item->{ rights } = xpath_select($n, $config->{ Item }{ Rights });
        }
        push @{ $feed->{ items } }, $item;
    }

    my @times =
        grep { defined }
        map { @{ $_ }{ qw(published updated) } }
        @{ $feed->{ items } };
    if (not defined $feed->{ published }) {
        $feed->{ published } = min @times;
    }
    if (not defined $feed->{ updated }) {
        $feed->{ updated } = max @times;
    }

    return $feed;

}

# Main script: parse the command line, load the feed configuration, obtain
# the HTML input, convert it to a feed structure and serialize that feed.

# -h prints the usage text and exits successfully; an option parsing failure
# aborts with the usage text.
GetOptions(
    'a'   => \my $force_atom,
    'r'   => \my $force_rss,
    'o=s' => \my $output,
    'h'   => sub { print $USAGE; exit 0 },
) or die $USAGE;

my ($config_file, $html) = @ARGV;

if (not defined $config_file) {
    die $USAGE;
}

my $config = process_config($config_file);

# Without an explicit location argument, fall back on [Feed].Link.
$html //= $config->{ Feed }{ Link };
if (not defined $html) {
    die "html2rss.pl requires an HTML file/page supplied to it either via a command-line argument or the [Feed].Link configuration field\n";
}

my $file;
if ($html =~ /^\w+:\/\//) {
    # Location looks like a URL (scheme://...): hand it to curl_to together
    # with a temporary file path, then read the page from that file. Only the
    # path is needed, so the handle is closed right away; UNLINK => 1 has
    # File::Temp remove the file when the program exits.
    my ($tmp_fh, $tmp_path) = tempfile(UNLINK => 1);
    close $tmp_fh;
    curl_to($html, $tmp_path, user_agent => $DEFAULT_AGENT);
    $file = $tmp_path;
} else {
    # Anything else is treated as a local file path.
    $file = $html;
}

my $feed = html2feed($file, $config);

# Output format precedence: -a, then -r, then [Feed].Format, then atom.
my $format =
    $force_atom ? 'atom' :
    $force_rss  ? 'rss'  :
    $config->{ Feed }{ Format } // 'atom';

my $feed_xml;
if ($format eq 'atom') {
    $feed_xml = generate_atom_feed($feed);
} elsif ($format eq 'rss') {
    $feed_xml = generate_rss_feed($feed);
} else {
    die "'$format' is not a valid feed format\n";
}

# The second argument to toFH/toFile is XML::LibXML's serialization format
# level.
if (not defined $output) {
    binmode *STDOUT;
    $feed_xml->toFH(*STDOUT, 2);
} else {
    # toFile returns a state value; do not claim success if the write failed.
    $feed_xml->toFile($output, 2)
        or die "Failed to write feed to $output\n";
    say "Wrote feed to $output";
}

=head1 NAME

html2rss.pl - Convert HTML pages to RSS feeds

=head1 USAGE

  html2rss.pl [options] config [location]

=head1 DESCRIPTION

B<html2rss.pl> is a Perl script that converts HTML pages to RSS feeds based on
parameters read from a given feed configuration file.

B<html2rss.pl> takes a configuration file as input, whose format is described
in the subsequent section. B<html2rss.pl> can also optionally take either a
path to an HTML file or a URL to manually specify the HTML file/page to
convert. If no location is given, B<html2rss.pl> will convert the URL/path
set as the feed link in the configuration file.

=head1 CONFIGURATION

B<html2rss.pl> processes HTML pages based on parameters it reads from
configuration files. B<html2rss.pl> uses a configuration format similar to
the INI file format. A configuration file consists of sections marked with
their name enclosed in square brackets.

  # Section named "Feed"
  [Feed]

Sections contain lists of configuration options, which are lines of
key-value pairs separated by an equals sign.

  [Feed]
  Title = Yadda yadda...
  Link = https://phony.com/
  Description = [/html/body/div[1]/p]

Lines starting with a hash are comments and will be ignored by
B<html2rss.pl>.

B<html2rss.pl> requires two sections: C<[Feed]> and C<[Item]>.

=head2 Selectors

Many configuration options can take an XPath selector statement as a value. A
selector statement will select elements from the HTML's DOM tree and use the
captured text as the value for the configuration field. Selector statements
are enclosed in either square or angle brackets. A single pair of brackets
will capture only the first selected element. A double pair of brackets will
capture the contents of all matching elements.

=over 4

=item [I<selector>]

=item [[I<selector>]]

Capture the text contents of the selected elements.

=item <I<selector>>

=item <<I<selector>>>

Capture the serialized HTML of the selected elements.

=back

=head2 [Feed] options

=over 4

=item Title = I<string> | I<selector>

The string or element selector for the feed's title. B<This is a required
field>.

=item Link = I<url>

The HTML page's URL. B<This is a required field>.

=item Description = I<string> | I<selector>

The string or element selector for the feed's description.

=item Language = I<string> | I<selector>

The string or element selector for the feed's language.

=item Rights = I<string> | I<selector>

The string or element selector for the feed's rights disclaimer.

=item Published = I<time> | I<selector>

The string or element selector for the feed's published date. See the
L</Special Time Options> section below for a list of special string options that
can be used for this field. If this field is set to a non-special string or
selector, B<html2rss.pl> will read the string/captured text as a timestamp in
the format specified by C<PublishedFmt>. If C<PublishedFmt> is not set, an
error will be raised.

=item Updated = I<time> | I<selector>

Same as C<Published>, but for the feed's update time.

=item Categories = I<tags> | I<selector>

Comma-separated list of tags or element selector for a feed's category list.
If a selector is used, the selector will use the captured contents of each
element (regardless of whether it is a single selector or multi selector) as
separate categories.

=item Generator = I<string> | I<selector>

The string or element selector for the feed's generator.

=item TTL = I<minutes>

Number of minutes to use for the feed's TTL.

=item SkipHours = I<hours>

Comma-separated list of hours (C<0>-C<23>) to use for the feed's
C<skipHours> field.

=item SkipDays = I<days>

Comma-separated list of days to use for the feed's C<skipDays> field. A day can
be a weekday's numerical value (C<0> for Sunday, C<1> for Monday, ...), the
abbreviated name of the weekday (C<sun>, C<mon>, ...), or the weekday's full
name (C<sunday>, C<monday>, ...). Case does not matter.

=item Format = I<format>

Default format to use for the generated feed. Can either be C<atom> or C<rss>.
C<atom> is the default.

=item PublishedFmt = I<fmt>

The L<strptime(3)> format string to use for parsing C<Published> timestamps.

=item UpdatedFmt = I<fmt>

Same as above, but for C<Updated> timestamps.

=back

=head2 [Item] options

=over 4

=item Select = I<selector>

Element selector that will be used to select each element that will be
converted into a feed entry. Each selected element will be used as the base
element for other C<[Item]> selector statements, so that you can use the
C<./...> syntax to select relative to that element. B<This is a required
field>.

=item Title = I<string> | I<selector>

The string or element selector for the item's title. B<This is a required
field>.

=item Link = I<url> | I<selector>

The URL or element selector for the item's link. B<This is a required field>.

=item Published = I<time> | I<selector>

The string or element selector for the item's published date. See the
L</Special Time Options> section below for a list of special string options that
can be used for this field. If this field is set to a non-special string or
selector, B<html2rss.pl> will read the string/captured text as a timestamp in
the format specified by C<PublishedFmt>. If C<PublishedFmt> is not set, an
error will be raised.

=item Updated = I<time> | I<selector>

Same as above, but for the C<Updated> field.

=item Author = I<string> | I<selector>

The string or element selector for the item's author.

=item Content = I<string> | I<selector>

The string or element selector for the item's content.

=item Categories = I<tags> | I<selector>

Comma-separated list or element selector for the item's category list.
If a selector is used, the selector will use the captured contents of each
element (regardless of whether it is a single selector or multi selector) as
separate categories.

=item Rights = I<string> | I<selector>

The string or element selector for the item's rights.

=item PublishedFmt = I<fmt>

The L<strptime(3)> format to use for timestamps parsed in the C<Published>
field.

=item UpdatedFmt = I<fmt>

The same as above, but for the C<Updated> field.

=back

=head3 Special Time Options

=over 4

=item now

Set time to the time the script was run.

=item today

Set time to the beginning of today.

=item week

Set time to the beginning of the week.

=item month

Set time to the beginning of the month.

=item year

Set time to the beginning of the year.

=item published

Set time to the time of the feed/item's C<Published> field. Only works for the
C<Updated> fields.

=item updated

Set time to the time of the feed/item's C<Updated> field. Only works for the
C<Published> fields.

=back

=head1 OPTIONS

=over 4

=item B<-a>

Generate an Atom feed.

=item B<-r>

Generate an RSS 2.0 feed.

=item B<-o> I<file>

Output the generated feed to the specified file. If not set, the feed will
be written to standard output.

=item B<-h>

Print the script's usage message.

=back

=head1 AUTHOR

Written by Samuel Young, E<lt>samyoung12788@gmail.comE<gt>.

This project's source can be found on its
L<Codeberg page|https://codeberg.org/1-1sam/noss.git>. Comments and pull
requests are welcome!

=head1 COPYRIGHT

Copyright (C) 2025 Samuel Young

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

=head1 SEE ALSO

L<noss(1)>, L<strptime(3)>, L<strftime(3)>

=cut
