package NMSCharset; use strict; =head1 NAME NMSCharset - a charset-aware object for handling text strings =head1 SYNOPSIS my $cs = NMSCharset->new('iso-8859-1'); my $safe_to_put_in_html = $cs->escape($untrusted_user_input); my $printable = &{ $cs->strip_nonprint_coderef }( $input ); my $escaped = &{ $cs->escape_html_coderef }( $printable ); =head1 DESCRIPTION Each object of class C is bound to a particular character set when it is created. The object provides methods to generate coderefs to perform a couple of character set dependent operations on text strings. =cut =head1 CONSTRUCTORS =over =item new ( CHARSET ) Creates a new C object, suitable for handing text in the character set CHARSET. The CHARSET parameter must be a character set string, such as C or C for example. =cut sub new { my ($pkg, $charset) = @_; my $self = { CHARSET => $charset }; if ($charset =~ /^utf-8$/i) { $self->{SN} = \&_strip_nonprint_utf8; $self->{EH} = \&_escape_html_utf8; } elsif ($charset =~ /^iso-8859/i) { $self->{SN} = \&_strip_nonprint_8859; if ($charset =~ /^iso-8859-1$/i) { $self->{EH} = \&_escape_html_8859_1; } else { $self->{EH} = \&_escape_html_8859; } } elsif ($charset =~ /^us-ascii$/i) { $self->{SN} = \&_strip_nonprint_ascii; $self->{EH} = \&_escape_html_8859_1; } else { $self->{SN} = \&_strip_nonprint_weak; $self->{EH} = \&_escape_html_weak; } return bless $self, $pkg; } =back =head1 METHODS =over =item charset () Returns the CHARSET string that was passed to the constructor. =cut sub charset { my ($self) = @_; return $self->{CHARSET}; } =item escape ( STRING ) Returns a copy of STRING with runs of non-printable characters replaced with spaces and HTML metacharacters replaced with the equivalent entities. If STRING is undef then the empty string will be returned. =cut sub escape { my ($self, $string) = @_; return &{ $self->{EH} }( &{ $self->{SN} }($string) ); } =item strip_nonprint_coderef () Returns a reference to a sub to replace runs of non-printable characters with spaces, in a manner suited to the charset in use. The returned coderef points to a sub that takes a single readonly string argument and returns a modified version of the string. If undef is passed to the function then the empty string will be returned. =cut sub strip_nonprint_coderef { my ($self) = @_; return $self->{SN}; } =item escape_html_coderef () Returns a reference to a sub to escape HTML metacharacters in a manner suited to the charset in use. The returned coderef points to a sub that takes a single readonly string argument and returns a modified version of the string. =cut sub escape_html_coderef { my ($self) = @_; return $self->{EH}; } =back =head1 PRIVATE TABLES =over =item C<%eschtml_map> The C<%eschtml_map> hash maps C characters to the equivalent HTML entities. =cut use vars qw(%eschtml_map); %eschtml_map = ( ( map {chr($_) => "&#$_;"} (0..255) ), '<' => '<', '>' => '>', '&' => '&', '"' => '"', ); =back =head1 PRIVATE FUNCTIONS These functions are returned by the strip_nonprint_coderef() and escape_html_coderef() methods and invoked by the escape() method. The function most appropriate to the character set in use will be chosen. =over =item _strip_nonprint_utf8 Returns a copy of STRING with everything but printable C characters and valid C multibyte sequences replaced with space characters. =cut sub _strip_nonprint_utf8 { my ($string) = @_; return '' unless defined $string; $string =~ s% ( [\t\n\040-\176] # printable us-ascii | [\xC2-\xDF][\x80-\xBF] # U+00000080 to U+000007FF | \xE0[\xA0-\xBF][\x80-\xBF] # U+00000800 to U+00000FFF | [\xE1-\xEF][\x80-\xBF]{2} # U+00001000 to U+0000FFFF | \xF0[\x90-\xBF][\x80-\xBF]{2} # U+00010000 to U+0003FFFF | [\xF1-\xF7][\x80-\xBF]{3} # U+00040000 to U+001FFFFF | \xF8[\x88-\xBF][\x80-\xBF]{3} # U+00200000 to U+00FFFFFF | [\xF9-\xFB][\x80-\xBF]{4} # U+01000000 to U+03FFFFFF | \xFC[\x84-\xBF][\x80-\xBF]{4} # U+04000000 to U+3FFFFFFF | \xFD[\x80-\xBF]{5} # U+40000000 to U+7FFFFFFF ) | . % defined $1 ? $1 : ' ' %gexs; # # U+FFFE, U+FFFF and U+D800 to U+DFFF are dangerous and # should be treated as invalid combinations, according to # http://www.cl.cam.ac.uk/~mgk25/unicode.html # $string =~ s%\xEF\xBF[\xBE-\xBF]% %g; $string =~ s%\xED[\xA0-\xBF][\x80-\xBF]% %g; return $string; } =item _escape_html_utf8 ( STRING ) Returns a copy of STRING with any HTML metacharacters escaped. Escapes all but the most commonly occurring C characters and bytes that might form part of valid C multibyte sequences. =cut sub _escape_html_utf8 { my ($string) = @_; $string =~ s|([^\w \t\r\n\-\.\,\x80-\xFD])| $eschtml_map{$1} |ge; return $string; } =item _strip_nonprint_weak ( STRING ) Returns a copy of STRING with sequences of NULL characters replaced with space characters. =cut sub _strip_nonprint_weak { my ($string) = @_; return '' unless defined $string; $string =~ s/\0+/ /g; return $string; } =item _escape_html_weak ( STRING ) Returns a copy of STRING with any HTML metacharacters escaped. In order to work in any charset, escapes only E, E, C<"> and C<&> characters. =cut sub _escape_html_weak { my ($string) = @_; $string =~ s/[<>"&]/$eschtml_map{$1}/eg; return $string; } =item _escape_html_8859_1 ( STRING ) Returns a copy of STRING with all but the most commonly occurring printable characters replaced with HTML entities. Only suitable for C or C input. =cut sub _escape_html_8859_1 { my ($string) = @_; $string =~ s|([^\w \t\r\n\-\.\,])| $eschtml_map{$1} |ge; return $string; } =item _escape_html_8859 ( STRING ) Returns a copy of STRING with all but the most commonly occurring printable C characters and characters that might be printable in some C charset replaced with HTML entities. =cut sub _escape_html_8859 { my ($string) = @_; $string =~ s|([^\w \t\r\n\-\.\,\240-\377])| $eschtml_map{$1} |ge; return $string; } =item _strip_nonprint_8859 ( STRING ) Returns a copy of STRING with runs of characters that are not printable in any C charset replaced with spaces. =cut sub _strip_nonprint_8859 { my ($string) = @_; return '' unless defined $string; $string =~ tr#\t\n\040-\176\240-\377# #cs; return $string; } =item _strip_nonprint_ascii ( STRING ) Returns a copy of STRING with runs of characters that are not printable C replaced with spaces. =cut sub _strip_nonprint_ascii { my ($string) = @_; return '' unless defined $string; $string =~ tr#\t\n\040-\176# #cs; return $string; } =back =head1 MAINTAINERS The NMS project, Ehttp://nms-cgi.sourceforge.net/E To request support or report bugs, please email Enms-cgi-support@lists.sourceforge.netE =head1 COPYRIGHT Copyright 2002 - 2004 London Perl Mongers, All rights reserved =head1 LICENSE This module is free software; you are free to redistribute it and/or modify it under the same terms as Perl itself. =cut 1;