417 lines
10 KiB
Perl
417 lines
10 KiB
Perl
# --
|
|
# Copyright (C) 2001-2019 OTRS AG, https://otrs.com/
|
|
# --
|
|
# This software comes with ABSOLUTELY NO WARRANTY. For details, see
|
|
# the enclosed file COPYING for license information (GPL). If you
|
|
# did not receive this file, see https://www.gnu.org/licenses/gpl-3.0.txt.
|
|
# --
|
|
|
|
package Kernel::System::Encode;
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Encode;
|
|
use Encode::Locale;
|
|
use IO::Interactive qw(is_interactive);
|
|
|
|
our %ObjectManagerFlags = (
|
|
IsSingleton => 1,
|
|
);
|
|
our @ObjectDependencies = ();
|
|
|
|
=head1 NAME
|
|
|
|
Kernel::System::Encode - character encodings
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This module will use Perl's Encode module (Perl 5.8.0 or higher is required).
|
|
|
|
=head1 PUBLIC INTERFACE
|
|
|
|
=head2 new()
|
|
|
|
Don't use the constructor directly, use the ObjectManager instead:
|
|
|
|
my $EncodeObject = $Kernel::OM->Get('Kernel::System::Encode');
|
|
|
|
=cut
|
|
|
|
sub new {
|
|
my ( $Type, %Param ) = @_;
|
|
|
|
# allocate new hash for object
|
|
my $Self = {};
|
|
bless( $Self, $Type );
|
|
|
|
# 0=off; 1=on;
|
|
$Self->{Debug} = 0;
|
|
|
|
# use "locale" as an arg to encode/decode
|
|
@ARGV = map { decode( locale => $_, 1 ) } @ARGV; ## no critic
|
|
|
|
# check if the encodeobject is used from the command line
|
|
# if so, we need to decode @ARGV
|
|
if ( !is_interactive() ) {
|
|
|
|
# encode STDOUT and STDERR
|
|
$Self->ConfigureOutputFileHandle( FileHandle => \*STDOUT );
|
|
$Self->ConfigureOutputFileHandle( FileHandle => \*STDERR );
|
|
}
|
|
else {
|
|
|
|
if ( is_interactive(*STDOUT) ) {
|
|
binmode STDOUT, ":encoding(console_out)";
|
|
}
|
|
if ( is_interactive(*STDERR) ) {
|
|
binmode STDERR, ":encoding(console_out)";
|
|
}
|
|
}
|
|
|
|
return $Self;
|
|
}
|
|
|
|
=head2 Convert()
|
|
|
|
Convert a string from one charset to another charset.
|
|
|
|
my $utf8 = $EncodeObject->Convert(
|
|
Text => $iso_8859_1_string,
|
|
From => 'iso-8859-1',
|
|
To => 'utf-8',
|
|
);
|
|
|
|
my $iso_8859_1 = $EncodeObject->Convert(
|
|
Text => $utf-8_string,
|
|
From => 'utf-8',
|
|
To => 'iso-8859-1',
|
|
);
|
|
|
|
There is also a Force => 1 option if you need to force the
|
|
already converted string. And Check => 1 if the string result
|
|
should be checked to be a valid string (e. g. valid utf-8 string).
|
|
|
|
=cut
|
|
|
|
sub Convert {
|
|
my ( $Self, %Param ) = @_;
|
|
|
|
# return if no text is given
|
|
return if !defined $Param{Text};
|
|
return '' if $Param{Text} eq '';
|
|
|
|
# check needed stuff
|
|
for (qw(From To)) {
|
|
if ( !defined $Param{$_} ) {
|
|
print STDERR "Need $_!\n";
|
|
return;
|
|
}
|
|
}
|
|
|
|
# utf8 cleanup
|
|
for my $Key (qw(From To)) {
|
|
$Param{$Key} = lc $Param{$Key};
|
|
if ( $Param{$Key} eq 'utf8' ) {
|
|
$Param{$Key} = 'utf-8';
|
|
}
|
|
}
|
|
|
|
# if no encode is needed
|
|
if ( $Param{From} eq $Param{To} ) {
|
|
|
|
# set utf-8 flag
|
|
if ( $Param{To} eq 'utf-8' ) {
|
|
Encode::_utf8_on( $Param{Text} );
|
|
}
|
|
|
|
# check if string is valid utf-8
|
|
if ( $Param{Check} && !eval { Encode::is_utf8( $Param{Text}, 1 ) } ) {
|
|
Encode::_utf8_off( $Param{Text} );
|
|
|
|
# strip invalid chars / 0 = will put a substitution character in
|
|
# place of a malformed character
|
|
eval { Encode::from_to( $Param{Text}, $Param{From}, $Param{To}, 0 ) };
|
|
|
|
# set utf-8 flag
|
|
Encode::_utf8_on( $Param{Text} );
|
|
|
|
# return new string
|
|
return $Param{Text};
|
|
}
|
|
|
|
# return text
|
|
return $Param{Text};
|
|
}
|
|
|
|
# encode is needed
|
|
if ( $Param{Force} ) {
|
|
Encode::_utf8_off( $Param{Text} );
|
|
}
|
|
|
|
# this is a workaround for following bug in Encode::HanExtra
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=71720
|
|
# see also http://bugs.otrs.org/show_bug.cgi?id=10121
|
|
# distributed charsets by Encode::HanExtra
|
|
# http://search.cpan.org/~jhi/perl-5.8.1/ext/Encode/lib/Encode/Supported.pod
|
|
my %AdditionalChineseCharsets = (
|
|
'big5ext' => 1,
|
|
'big5plus' => 1,
|
|
'cccii' => 1,
|
|
'euc-tw' => 1,
|
|
'gb18030' => 1,
|
|
);
|
|
|
|
# check if one of the Encode::HanExtra charsets occurs
|
|
if ( $AdditionalChineseCharsets{ $Param{From} } ) {
|
|
|
|
# require module, print error if module was not found
|
|
if ( !eval "require Encode::HanExtra" ) { ## no critic
|
|
print STDERR
|
|
"Charset '$Param{From}' requires Encode::HanExtra, which is not installed!\n";
|
|
}
|
|
}
|
|
|
|
# check if encoding exists
|
|
if ( !Encode::resolve_alias( $Param{From} ) ) {
|
|
my $Fallback = 'iso-8859-1';
|
|
print STDERR "Not supported charset '$Param{From}', fallback to '$Fallback'!\n";
|
|
$Param{From} = $Fallback;
|
|
}
|
|
|
|
# set check for "Handling Malformed Data", for more info see "perldoc Encode -> CHECK"
|
|
|
|
# 1 = methods will die on error immediately with an error
|
|
my $Check = 1;
|
|
|
|
# 0 = will put a substitution character in place of a malformed character
|
|
if ( $Param{Force} ) {
|
|
$Check = 0;
|
|
}
|
|
|
|
# convert string
|
|
if ( !eval { Encode::from_to( $Param{Text}, $Param{From}, $Param{To}, $Check ) } ) {
|
|
|
|
# truncate text for error messages
|
|
my $TruncatedText = $Param{Text};
|
|
if ( length($TruncatedText) > 65 ) {
|
|
$TruncatedText = substr( $TruncatedText, 0, 65 ) . '[...]';
|
|
}
|
|
|
|
print STDERR "Charset encode '$Param{From}' -=> '$Param{To}' ($TruncatedText)"
|
|
. " not supported!\n";
|
|
|
|
# strip invalid chars / 0 = will put a substitution character in place of
|
|
# a malformed character
|
|
eval { Encode::from_to( $Param{Text}, $Param{From}, $Param{To}, 0 ) };
|
|
|
|
# set utf-8 flag
|
|
if ( $Param{To} eq 'utf-8' ) {
|
|
Encode::_utf8_on( $Param{Text} );
|
|
}
|
|
|
|
return $Param{Text};
|
|
}
|
|
|
|
# set utf-8 flag
|
|
if ( $Param{To} eq 'utf-8' ) {
|
|
Encode::_utf8_on( $Param{Text} );
|
|
}
|
|
|
|
# output debug message
|
|
if ( $Self->{Debug} ) {
|
|
print STDERR "Charset encode '$Param{From}' -=> '$Param{To}' ($Param{Text})!\n";
|
|
}
|
|
|
|
return $Param{Text};
|
|
}
|
|
|
|
=head2 Convert2CharsetInternal()
|
|
|
|
Convert given charset into the internal used charset (utf-8).
|
|
Should be used on all I/O interfaces.
|
|
|
|
my $String = $EncodeObject->Convert2CharsetInternal(
|
|
Text => $String,
|
|
From => $SourceCharset,
|
|
);
|
|
|
|
=cut
|
|
|
|
sub Convert2CharsetInternal {
|
|
my ( $Self, %Param ) = @_;
|
|
|
|
return if !defined $Param{Text};
|
|
|
|
# check needed stuff
|
|
if ( !defined $Param{From} ) {
|
|
print STDERR "Need From!\n";
|
|
return;
|
|
}
|
|
|
|
return $Self->Convert( %Param, To => 'utf-8' );
|
|
}
|
|
|
|
=head2 EncodeInput()
|
|
|
|
Convert internal used charset (e. g. utf-8) into given charset (utf-8).
|
|
|
|
Should be used on all I/O interfaces if data is already utf-8 to set the utf-8 stamp.
|
|
|
|
$EncodeObject->EncodeInput( \$String );
|
|
|
|
$EncodeObject->EncodeInput( \@Array );
|
|
|
|
=cut
|
|
|
|
sub EncodeInput {
|
|
my ( $Self, $What ) = @_;
|
|
|
|
return if !defined $What;
|
|
|
|
if ( ref $What eq 'SCALAR' ) {
|
|
return $What if !defined ${$What};
|
|
Encode::_utf8_on( ${$What} );
|
|
return $What;
|
|
}
|
|
|
|
if ( ref $What eq 'ARRAY' ) {
|
|
|
|
ROW:
|
|
for my $Row ( @{$What} ) {
|
|
next ROW if !defined $Row;
|
|
Encode::_utf8_on($Row);
|
|
}
|
|
return $What;
|
|
}
|
|
|
|
Encode::_utf8_on($What);
|
|
|
|
return $What;
|
|
}
|
|
|
|
=head2 EncodeOutput()
|
|
|
|
Convert utf-8 to a sequence of bytes. All possible characters have
|
|
a UTF-8 representation so this function cannot fail.
|
|
|
|
This should be used in for output of utf-8 chars.
|
|
|
|
$EncodeObject->EncodeOutput( \$String );
|
|
|
|
$EncodeObject->EncodeOutput( \@Array );
|
|
|
|
=cut
|
|
|
|
sub EncodeOutput {
|
|
my ( $Self, $What ) = @_;
|
|
|
|
if ( ref $What eq 'SCALAR' ) {
|
|
return $What if !defined ${$What};
|
|
return $What if !Encode::is_utf8( ${$What} );
|
|
${$What} = Encode::encode_utf8( ${$What} );
|
|
return $What;
|
|
}
|
|
|
|
if ( ref $What eq 'ARRAY' ) {
|
|
|
|
ROW:
|
|
for my $Row ( @{$What} ) {
|
|
next ROW if !defined $Row;
|
|
next ROW if !Encode::is_utf8( ${$Row} );
|
|
${$Row} = Encode::encode_utf8( ${$Row} );
|
|
}
|
|
return $What;
|
|
}
|
|
|
|
return $What if !Encode::is_utf8( \$What );
|
|
Encode::encode_utf8( \$What );
|
|
return $What;
|
|
}
|
|
|
|
=head2 ConfigureOutputFileHandle()
|
|
|
|
switch output file handle to utf-8 output.
|
|
|
|
$EncodeObject->ConfigureOutputFileHandle( FileHandle => \*STDOUT );
|
|
|
|
=cut
|
|
|
|
sub ConfigureOutputFileHandle {
|
|
my ( $Self, %Param ) = @_;
|
|
|
|
return if !defined $Param{FileHandle};
|
|
return if ref $Param{FileHandle} ne 'GLOB';
|
|
|
|
# http://www.perlmonks.org/?node_id=644786
|
|
# http://bugs.otrs.org/show_bug.cgi?id=12100
|
|
binmode( $Param{FileHandle}, ':utf8' ); ## no critic
|
|
|
|
return 1;
|
|
}
|
|
|
|
=head2 EncodingIsAsciiSuperset()
|
|
|
|
Checks if an encoding is a super-set of ASCII, that is, encodes the
|
|
codepoints from 0 to 127 the same way as ASCII.
|
|
|
|
my $IsSuperset = $EncodeObject->EncodingIsAsciiSuperset(
|
|
Encoding => 'UTF-8',
|
|
);
|
|
|
|
=cut
|
|
|
|
sub EncodingIsAsciiSuperset {
|
|
my ( $Self, %Param ) = @_;
|
|
if ( !defined $Param{Encoding} ) {
|
|
print STDERR "Need Encoding!\n";
|
|
return;
|
|
}
|
|
if ( !defined Encode::find_encoding( $Param{Encoding} ) ) {
|
|
print STDERR "Unsupported Encoding $Param{Encoding}!\n";
|
|
return;
|
|
}
|
|
my $Test = join '', map {chr} 0 .. 127;
|
|
return Encode::encode( $Param{Encoding}, $Test )
|
|
eq Encode::encode( 'ASCII', $Test );
|
|
}
|
|
|
|
=head2 FindAsciiSupersetEncoding()
|
|
|
|
From a list of character encodings, returns the first that
|
|
is a super-set of ASCII. If none matches, C<ASCII> is returned.
|
|
|
|
my $Encoding = $EncodeObject->FindAsciiSupersetEncoding(
|
|
Encodings => [ 'UTF-16LE', 'UTF-8' ],
|
|
);
|
|
|
|
=cut
|
|
|
|
sub FindAsciiSupersetEncoding {
|
|
my ( $Self, %Param ) = @_;
|
|
if ( !defined $Param{Encodings} ) {
|
|
print STDERR "Need Encodings!\n";
|
|
return;
|
|
}
|
|
ENCODING:
|
|
for my $Encoding ( @{ $Param{Encodings} } ) {
|
|
next ENCODING if !$Encoding;
|
|
if ( $Self->EncodingIsAsciiSuperset( Encoding => $Encoding ) ) {
|
|
return $Encoding;
|
|
}
|
|
}
|
|
return 'ASCII';
|
|
}
|
|
|
|
1;
|
|
|
|
=head1 TERMS AND CONDITIONS
|
|
|
|
This software is part of the OTRS project (L<https://otrs.org/>).
|
|
|
|
This software comes with ABSOLUTELY NO WARRANTY. For details, see
|
|
the enclosed file COPYING for license information (GPL). If you
|
|
did not receive this file, see L<https://www.gnu.org/licenses/gpl-3.0.txt>.
|
|
|
|
=cut
|