#!/usr/bin/perl -w

use strict;
use diagnostics;
use XML::XPath;
use XML::RSS;
use LWP::UserAgent;
use Date::Manip;

# Usage: perl <this script> <page url> <save path>
# Both positional arguments are required: the URL of the page to scrape and
# the path the generated RSS file is written to. Extra arguments are ignored.
if (@ARGV < 2) {
  print STDERR "\n perl $0 <page url> <save path> \n\n";
  exit(1);
}

my $page_url = $ARGV[0];
my $save_path = $ARGV[1];

# Timestamp used as the channel's dc:date. Backticks keep the command's
# trailing newline, so chomp it before the value is logged or written out.
chomp(my $today = `date "+%Y-%m-%dT%H:%M:%S"`);
log3("today is $today");

my $ua = LWP::UserAgent->new();
log3("initializing ua");

my $request = HTTP::Request->new('GET', $page_url);
log3("sending request");

# simple_request() performs exactly one HTTP exchange; redirects are not
# followed.
my $response = $ua->simple_request($request);
log3("fetching response");

# Stop here with the HTTP status when the fetch failed. Declaring $xp with a
# trailing "if" would leave it undefined and make the first XPath query die
# with an unrelated-looking error.
die "failed to fetch $page_url: " . $response->status_line . "\n"
  unless $response->is_success;

my $xp = XML::XPath->new(xml => $response->content);

# Channel-level metadata, read from the page's Dublin Core <meta> tags and
# its <link rel="home"> element. Each value is logged at debug level as it
# is extracted.
my $channel_title = $xp->findvalue('//meta[@name="DC.Title"]/@content');
log3("channel title: $channel_title");

my $channel_link = $xp->findvalue('//link[@rel="home"]/@href');
log3("channel link: $channel_link");

my $channel_description = $xp->findvalue('//meta[@name="DC.Description"]/@content');
log3("channel description: $channel_description");

my $channel_subject = $xp->findvalue('//meta[@name="DC.Subject"]/@content');
log3("channel subject: $channel_subject");

my $channel_creator = $xp->findvalue('//meta[@name="DC.Creator"]/@content');
log3("channel creator: $channel_creator");

my $channel_publisher = $xp->findvalue('//meta[@name="DC.Publisher"]/@content');
log3("channel publisher: $channel_publisher");

my $channel_rights = $xp->findvalue('//meta[@name="DC.Rights"]/@content');
log3("channel rights: $channel_rights");

# Build an RSS 1.0 document; the feed is declared as UTF-8.
my $rss = XML::RSS->new(version  => '1.0',
			encoding => 'UTF-8');

# The core channel fields are passed directly; the remaining metadata goes
# under the "dc" key.
$rss->channel(title          => $channel_title,
	      link           => $channel_link,
	      description    => $channel_description,
	      dc => {
		date           => $today,
		subject        => $channel_subject,
		creator        => $channel_creator,
		publisher      => $channel_publisher,
		rights         => $channel_rights,
		language       => 'en-us',
		docs           => 'http://purl.org/rss/1.0/',
	      },
	      );

# Every <div class="blog:post"> is one post; each <div class="rss:item">
# nested inside it becomes one RSS item that links to the post's permalink.
my $path = q{//div[@class="blog:post"]};

my $nodeset = $xp->find($path); # find all posts

foreach my $post ($nodeset->get_nodelist) {
	my $permalink = $xp->findvalue('./span[@class="content"]/span[@class="permalink"]/a/@href', $post);
	log3("$permalink");

	my $itemset = $xp->find('.//div[@class="rss:item"]', $post); # find all items of this post
	foreach my $item ($itemset->get_nodelist) {
		# Reduce the item's content paragraphs to one line of plain text
		# before measuring or truncating. Cutting the raw string would count
		# the whitespace left behind by stripped tags, and substr() at a
		# non-zero offset returns undef (with a warning) on empty content.
		my $content = clean_text($xp->findnodes_as_string('./p[@class="content"]', $item));
		my $contentlen = length($content);
		log3($contentlen);

		# TODO: cut on sentence boundaries instead of fixed character counts.
		# Description: at most 500 characters, with an ellipsis only when
		# text was really cut off.
		my $content_desc = substr($content, 0, 500);
		$content_desc .= "..." if $contentlen > 500;

		# Prefer an explicit title; otherwise fall back to the first 100
		# characters of the content.
		my $title;
		if ($xp->find('boolean(./span[@class="rss:title"])', $item)) {
			log3("yes, there's a title.");
			$title = $xp->findnodes_as_string('./span[@class="rss:title"]', $item);
		}
		else {
			$title = substr($content, 0, 100);
		}

		$rss->add_item( title       => clean_text($title),
				link        => $permalink,
				description => clean_text($content_desc)
		);
	}
}

# Write the finished feed to the path given on the command line.
$rss->save($save_path);

sub clean_text {
	# Reduce a markup fragment to a single line of plain text.
	#
	# Takes one string and returns it with carriage returns deleted, every
	# tag replaced by a space, leading and trailing whitespace removed, and
	# each remaining run of whitespace collapsed to one space.
	# An undefined argument yields the empty string.
	my($text) = @_;

	# Treat a missing value as empty text so the substitutions below do not
	# raise "uninitialized value" warnings under -w.
	return '' unless defined $text;

	$text =~ s/\r//g;          # delete carriage returns
	$text =~ s/\s*\n\s*/ /g;   # replace line breaks with a single space
	$text =~ s/<[^<>]*>/ /g;   # replace each tag with a space

	$text =~ s/^\s+//;         # strip leading whitespace
	$text =~ s/\s+$//;         # strip trailing whitespace
	$text =~ s/\s+/ /g;        # collapse runs of whitespace

	return $text;
}

sub log1 {
  # WARN level: write the given message to STDERR as one "WARN: " line.
  # (Verbosity gating is currently disabled, so the message is always
  # printed.)
  my ($message) = @_;
  print STDERR 'WARN: ' . $message . "\n";
}

sub log2 {
  # INFO level: write the given message to STDERR as one "INFO: " line.
  # (Verbosity gating is currently disabled, so the message is always
  # printed.)
  my ($message) = @_;
  print STDERR 'INFO: ' . $message . "\n";
}

sub log3 {
  # DBUG level: write the given message to STDERR as one "DBUG: " line.
  # (Verbosity gating is currently disabled, so the message is always
  # printed.)
  my ($message) = @_;
  print STDERR 'DBUG: ' . $message . "\n";
}