I had been thinking that I understood the Perl UTF-8 flag and Unicode handling very well, having more than five years of professional experience dealing with I18N and L10N issues in Perl.
But it turns out that I still have something to learn, or at least there are things I have only learned recently.
So here's the code.
#!/usr/bin/perl
# Round-trip test: push the word "Dictionarios" (with an a-acute), held as
# three different kinds of Perl string, through XML::RSS, XML::RSS::LibXML and
# XML::Atom, and check that each module hands back the same text after a
# serialize + parse cycle.
#
# Each statement sits on its own line: anything that shares the "#!" line is
# part of that comment and would never be executed.
use strict;
use warnings;

use Encode;
use File::Temp qw(tempfile);
use XML::RSS;
use XML::RSS::LibXML;
use XML::Atom::Feed;
use Test::More 'no_plan';

# XML::Atom package settings.
# NOTE(review): presumably these request decoded character strings and
# Atom 1.0 output -- confirm against the XML::Atom documentation.
$XML::Atom::ForceUnicode   = 1;
$XML::Atom::DefaultVersion = "1.0";

# The same word in three representations, keyed by the label that appears in
# the test descriptions.
my %data = (
    latin1 => "Diction" . chr(225) . "rios",      # a-acute as the single character 0xE1
    utf8   => "Diction" . "\xc3\xa1" . "rios",    # a-acute as the two octets 0xC3 0xA1
);
$data{unicode} = decode_utf8($data{utf8});        # the UTF-8 octets decoded by Encode

# Dispatch table: module label => test routine exercising that module.
my %code = (
    'XML::RSS'         => \&test_xml_rss,
    'XML::RSS::LibXML' => \&test_xml_rss_libxml,
    'XML::Atom'        => \&test_xml_atom,
);

# Run every module against every representation, in a fixed order.
for my $module (qw(XML::RSS XML::RSS::LibXML XML::Atom)) {
    for my $label (qw(latin1 utf8 unicode)) {
        $code{$module}->($data{$label}, $label);
    }
}
# Compare two strings after normalising both with _unicode(), so the
# comparison does not depend on how each string happens to be encoded.
#
# Arguments: the two strings to compare, followed by the test description
#            (the last argument is always used as the description).
# Returns:   whatever Test::More's is() returns.
sub is_same {
    my ($got, $expected) = map { _unicode($_) } @_[0 .. 1];
    my $description = $_[-1];

    # Make a failure point at the caller's line instead of this helper.
    local $Test::Builder::Level = $Test::Builder::Level + 1;

    return is($got, $expected, $description);
}
# Normalise a string to a decoded character string.
#
# - A string that already carries Perl's UTF-8 flag is returned untouched.
# - Otherwise the octets are decoded as strict UTF-8 when they form a valid
#   UTF-8 sequence.
# - Anything else is treated as Latin-1.
#
# Validating the whole string (rather than looking for one particular lead
# byte) recognises UTF-8 text outside U+00C0..U+00FF as well, and keeps a
# Latin-1 string that merely contains byte 0xC3 from being mis-decoded.
#
# Argument: the string to normalise.
# Returns:  the decoded character string.
sub _unicode {
    my $str = shift;

    return $str if utf8::is_utf8($str);

    # LEAVE_SRC keeps $str intact for the Latin-1 fallback; FB_CROAK turns
    # malformed UTF-8 into an exception that the eval absorbs.
    local $@;
    my $decoded = eval {
        Encode::decode('UTF-8', $str, Encode::FB_CROAK | Encode::LEAVE_SRC);
    };
    return $decoded if defined $decoded;

    return Encode::decode('latin-1', $str);
}
# Round-trip a string through XML::RSS: set it as the channel title,
# serialize the feed, write the XML to a temporary file, parse that file with
# a fresh XML::RSS object and compare the parsed title with the original.
#
# Arguments: $string - the title text under test
#            $label  - name of the representation, used in test descriptions
sub test_xml_rss {
    my ($string, $label) = @_;

    my $rss = XML::RSS->new;
    $rss->channel(title => $string);

    # Report whether the serialized XML carries Perl's UTF-8 flag.
    my $xml = $rss->as_string;
    diag "XML::RSS + $label: is_utf8() = ", utf8::is_utf8($xml) ? 1 : 0;

    $rss = XML::RSS->new;
    eval {
        my $tmp = write_file($xml);
        $rss->parsefile($tmp);
        is_same($rss->channel->{title}, $string, "XML::RSS $label");
    };

    # Copy $@ straight away (any later eval would clobber it) and show it,
    # so a failed round trip explains itself.
    my $error = $@;
    if ($error) {
        fail "XML::RSS $label";
        diag "XML::RSS $label died: $error";
    }

    return;
}
# Round-trip a string through XML::RSS::LibXML: set it as the channel title,
# serialize the feed, write the XML to a temporary file, parse that file with
# a fresh XML::RSS::LibXML object and compare the parsed title with the
# original.
#
# Arguments: $string - the title text under test
#            $label  - name of the representation, used in test descriptions
sub test_xml_rss_libxml {
    my ($string, $label) = @_;

    my $rss = XML::RSS::LibXML->new;
    $rss->channel(title => $string);

    # Report whether the serialized XML carries Perl's UTF-8 flag.
    my $xml = $rss->as_string;
    diag "XML::RSS::LibXML + $label: is_utf8() = ", utf8::is_utf8($xml) ? 1 : 0;

    $rss = XML::RSS::LibXML->new;
    eval {
        my $tmp = write_file($xml);
        $rss->parsefile($tmp);
        is_same($rss->channel->{title}, $string, "XML::RSS::LibXML $label");
    };

    # Copy $@ straight away (any later eval would clobber it) and show it,
    # so a failed round trip explains itself.
    my $error = $@;
    if ($error) {
        fail "XML::RSS::LibXML $label";
        diag "XML::RSS::LibXML $label died: $error";
    }

    return;
}
# Round-trip a string through XML::Atom: set it as the feed title, serialize
# the feed, write the XML to a temporary file, build a new XML::Atom::Feed
# from that file and compare its title with the original.
#
# Arguments: $string - the title text under test
#            $label  - name of the representation, used in test descriptions
sub test_xml_atom {
    my ($string, $label) = @_;

    my $feed = XML::Atom::Feed->new;
    $feed->title($string);

    # Report whether the serialized XML carries Perl's UTF-8 flag.
    my $xml = $feed->as_xml;
    diag "XML::Atom + $label: is_utf8() = ", utf8::is_utf8($xml) ? 1 : 0;

    eval {
        my $tmp = write_file($xml);
        $feed = XML::Atom::Feed->new($tmp);
        is_same($feed->title, $string, "XML::Atom $label");
    };

    # Copy $@ straight away (any later eval would clobber it) and show it,
    # so a failed round trip explains itself.
    my $error = $@;
    if ($error) {
        fail "XML::Atom $label";
        diag "XML::Atom $label died: $error";
    }

    return;
}
# Write $data to a fresh temporary file and return the file's name.
#
# No I/O layer is pushed onto the handle, so the data is printed exactly as
# Perl's default output handling emits it.
#
# Argument: the data to write.
# Returns:  the path of the temporary file.
# Dies:     if the data cannot be written or the handle cannot be closed.
sub write_file {
    my $data = shift;

    # UNLINK (not CLEANUP, which belongs to tempdir()) is the tempfile()
    # option that removes the file at program exit; without it a file
    # requested by name is left behind.
    my ($fh, $name) = tempfile(UNLINK => 1);

    print {$fh} $data or die "Cannot write to $name: $!";

    # Buffered write errors only surface at close, so check it.
    close $fh or die "Cannot close $name: $!";

    return $name;
}
# NOTE(review): no $fh is declared at file scope in the visible code (the only
# $fh is the lexical inside write_file), so this statement looks like a snippet
# meant to be placed inside write_file before the print -- confirm against the
# surrounding text.
binmode $fh, ":utf8";