#!/usr/bin/perl -w
#
# Fetch an XML-parseable (XHTML) weblog page and write an RSS 1.0 feed for it.
#
# Usage: perl xpath2rss0.1.pl <page_url> <save_path>
#
# Channel metadata is read from Dublin Core <meta name="DC.*"> elements and
# from <link rel="home">.  Every <div class="rss:item"> found inside a
# <div class="blog:post"> becomes one feed item whose link is the permalink
# of the enclosing post.

use strict;
use warnings;
use diagnostics;

use POSIX qw(strftime);

use XML::XPath;
use XML::RSS;
use LWP::UserAgent;
use Date::Manip;

# Extra arguments are ignored; fewer than two cannot work.
if (@ARGV < 2) {
    die "usage: perl $0 <page_url> <save_path>\n";
}
my ($page_url, $save_path) = @ARGV;

# Timestamp for dc:date.  Built in-process so that no shell is spawned and
# no trailing newline ends up in the feed.
my $today = strftime('%Y-%m-%dT%H:%M:%S', localtime);
log3("today is $today");

my $ua = LWP::UserAgent->new();
log3("initializing ua");
my $request = HTTP::Request->new('GET', $page_url);
log3("sending request");
my $response = $ua->simple_request($request);
log3("fetching response");

# Stop here on a failed fetch instead of carrying an undefined parser
# object into the XPath queries below.
if (!$response->is_success) {
    die "could not fetch $page_url: " . $response->status_line . "\n";
}

my $xp = XML::XPath->new(xml => $response->content);

my $channel_title       = channel_value($xp, 'title',       '//meta[@name="DC.Title"]/@content');
my $channel_link        = channel_value($xp, 'link',        '//link[@rel="home"]/@href');
my $channel_description = channel_value($xp, 'description', '//meta[@name="DC.Description"]/@content');
my $channel_subject     = channel_value($xp, 'subject',     '//meta[@name="DC.Subject"]/@content');
my $channel_creator     = channel_value($xp, 'creator',     '//meta[@name="DC.Creator"]/@content');
my $channel_publisher   = channel_value($xp, 'publisher',   '//meta[@name="DC.Publisher"]/@content');
my $channel_rights      = channel_value($xp, 'rights',      '//meta[@name="DC.Rights"]/@content');

my $rss = XML::RSS->new(version => '1.0', encoding => 'UTF-8');

$rss->channel(
    title       => $channel_title,
    link        => $channel_link,
    description => $channel_description,
    dc          => {
        date      => $today,
        subject   => $channel_subject,
        creator   => $channel_creator,
        publisher => $channel_publisher,
        rights    => $channel_rights,
        language  => 'en-us',
        docs      => 'http://purl.org/rss/1.0/',
    },
);

# Find all top-level posts.
my $path    = q{//div[@class="blog:post"]};
my $nodeset = $xp->find($path);

foreach my $post ($nodeset->get_nodelist) {
    # Force plain-string form of whatever findvalue returns.
    my $permalink = q{} . $xp->findvalue(
        './span[@class="content"]/span[@class="permalink"]/a/@href', $post);
    log3("$permalink");

    # Find all items inside this post.
    my $itemset = $xp->find('.//div[@class="rss:item"]', $post);

    foreach my $item ($itemset->get_nodelist) {
        # Strip markup and normalise whitespace BEFORE truncating, so the
        # 100/500 character limits apply to the visible text and the first
        # character is never dropped.
        my $content
            = clean_text($xp->findnodes_as_string('./p[@class="content"]', $item));
        my $contentlen = length $content;
        log3($contentlen);

        # NOTE: truncation is by character count and may cut mid-word; a
        # sentence-based variant (get_sentences) was sketched in an earlier
        # revision but never enabled.
        my $alt_title    = substr($content, 0, 100);
        my $content_desc = substr($content, 0, 500);
        if ($contentlen > 500) {
            $content_desc .= "...";
        }

        # Prefer an explicit rss:title span; otherwise fall back to the
        # leading part of the content.
        my $title;
        if ($xp->find('boolean(./span[@class="rss:title"])', $item)) {
            log3("yes, there's a title.");
            $title = $xp->findnodes_as_string('./span[@class="rss:title"]', $item);
        }
        else {
            $title = $alt_title;
        }

        $rss->add_item(
            title       => clean_text($title),
            link        => $permalink,
            description => clean_text($content_desc),
        );
    }
}

$rss->save($save_path);

# Evaluate $xpath against the whole document, log the result at debug
# level under $label, and return it as a plain string.
sub channel_value {
    my ($doc, $label, $xpath) = @_;
    my $value = q{} . $doc->findvalue($xpath);
    log3("channel $label: $value");
    return $value;
}

# Return $text with carriage returns removed, line breaks and markup tags
# replaced by spaces, runs of whitespace collapsed to a single space, and
# leading/trailing whitespace trimmed.  An undefined argument yields ''.
sub clean_text {
    my ($text) = @_;
    return q{} unless defined $text;

    $text =~ s/\r//g;             # delete carriage returns
    $text =~ s/\s*\n\s*/ /g;      # replace line breaks with a space
    $text =~ s/<[^<>]*>/ /g;      # strip markup tags
    $text =~ s/\A\s+//;           # strip leading whitespace
    $text =~ s/\s+\z//;           # strip trailing whitespace
    $text =~ s/\s+/ /g;           # collapse runs of whitespace
    return $text;
}

# Print a WARN-level message to STDERR.
sub log1 {
    my $logmsg = shift;
    # return unless VERBOSE >= 1;
    print STDERR "WARN: $logmsg\n";
}

# Print an INFO-level message to STDERR.
sub log2 {
    my $logmsg = shift;
    # return unless VERBOSE >= 2;
    print STDERR "INFO: $logmsg\n";
}

# Print a DBUG-level message to STDERR.
sub log3 {
    my $logmsg = shift;
    # return unless VERBOSE >= 3;
    print STDERR "DBUG: $logmsg\n";
}