[Moscow.pm] Unicode.

Alexandr Gomoliako zzz zzz.org.ua
20 13:07:53 PST 2012


On Fri, Feb 17, 2012 at 11:43 AM, Orlovsky Alexander
<nordicdyno at yandex.ru> wrote:
>    IT- , ,   ,          Saint Perl "Unicode. ":
> http://nordicdyno.github.com/blog/2012/02/17/unicode-basics/

,      5 ,      :

On Mon, Feb 20, 2012 at 7:27 PM, Tom Christiansen <tchrist at perl.com> wrote:
> Inspired by how scandalously Unicode-deficient the
> otherwise fine 4-way polyglot table comparing PHP, Perl,
> Python, and Ruby is at
>
>  http://hyperpolyglot.org/scripting
>
> I created a quick Unicode cheatsheet for Perl, mostly by
> mining the examples in the new 4th edition of the Camel.
>
> Gee, I foresee a *whole* lot of "impossibles" in the
> other three languages' columns, don't you? :)
>
> Hm, have I left anything out that Perl is especially cool with?
>
> I almost wonder whether this sort of thing oughtn't be a manpage,
> something like perluni{ref,cheat,quick}?
>
> --tom
>
> =Characters and their numbers
>
>  # ASCII
>  ord("A")
>  chr(65)
>
>  # BMP
>    ord("Σ")
>  chr(0x3A3)
>
>  # beyond the BMP
>    ord("𝑛")
>  chr(0x1D45B)
>
>  # beyond Unicode (up to MAXINT)
>  ord("\x{20_0000}")
>  chr(0x20_0000)
>
> =Unicode literals by character number
>
>  String: "\x{3a3}"
>  Regex: /\x{3a3}/
>
>  String: "\x{1d45b}"
>  Regex: /\x{1d45b}/
>
>  # even non-BMP ranges in regex work fine
>  /[\x{1D434}-\x{1D467}]/
>
> =Get character name by number
>
>  use charnames ();
>  my $name = charnames::viacode(0x03A3);
>
> =Get character number by name
>
>  use charnames ();
>  my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
>
> =Unicode named characters
>
>  use charnames qw(:full :short greek);
>
>  "\N{MATHEMATICAL ITALIC SMALL N}"
>  "\N{GREEK CAPITAL LETTER SIGMA}"
>  "\N{Greek:Sigma}"
>  "\N{epsilon}"
>
> =Unicode named sequences
>
>  use charnames qw(:full);
>  my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
>  printf "U+%v04X\n", $seq;
>  U+0100.0300
>
> =Custom named characters
>
>  use charnames ":full", ":alias" => {
>    ecute => "LATIN SMALL LETTER E WITH ACUTE",
>    "APPLE LOGO" => 0xF8FF, # private use character
>  };
>
>  "\N{ecute}"
>  "\N{APPLE LOGO}"
>
> =Declare source in utf8 for identifiers and literals
>
>  use utf8;
>
>    my $measure   = "Ångström";
>    my @μsoft     = qw( cp852 cp1251 cp1252 );
>    my @ὑπέρμεγας = qw( ὑπέρ μεγας );
>    my @鯉        = qw( koi8-f koi8-u koi8-r );
>
> =Unicode casing
>
>    uc("henry ⅷ")  # "HENRY Ⅷ"
>    uc("tschüß")   # "TSCHÜSS"
>
>  # both are true:
>    "tschüß"  =~ /TSCHÜSS/i
>    "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i
>
> =Unicode case-insensitive comparisons
>
>  use utf8;
>  use feature "fc"; # fc() function is from v5.16
>
>  # sort case-insensitively
>  my @sorted = sort { fc($a) cmp fc($b) } @list;
>
>  # both are true:
>    fc("tschüß")  eq fc("TSCHÜSS")
>    fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
>
> =Match Unicode linebreak sequence in regex
>
>  \R
>
>  s/\R/\n/g; # normalize all linebreaks to \n
>
> =Match Unicode properties in regex with \p, \P
>
>  \pL, \pN, \pS, \pP, \pM, \pZ, \pC
>  \p{Sk}, \p{Ps}, \p{Lt}
>  \p{alpha}, \p{upper}, \p{lower}
>  \p{Latin}, \p{Greek}
>  \p{script=Latin}, \p{script=Greek}
>  \p{East_Asian_Width=Wide}, \p{EA=W}
>  \p{Line_Break=Hyphen}, \p{LB=HY}
>  \p{Numeric_Value=4}, \p{NV=4}
>
> =Custom character properties
>
>  # using private-use characters
>  sub In_Tengwar { "E000\tE07F\n" }
>
>  if (/\p{In_Tengwar}/) { ... }
>
>  # blending existing properties
>  sub Is_GraecoRoman_Title {<<'END_OF_SET'}
>  +utf8::IsLatin
>  +utf8::IsGreek
>  &utf8::IsTitle
>  END_OF_SET
>
>  if (/\p{Is_GraecoRoman_Title}/) { ... }
>
> =Get character category
>
>  use Unicode::UCD qw(charinfo);
>  my $cat = charinfo(0x3A3)->{category}; # "Lu"
>
> =Convert non-ASCII Unicode numerics
>
>  # from v5.12
>  use Unicode::UCD qw(num);
>  if (/(\d+|\N)/) { # not just ASCII!
>    $nv = num($1);
>  }
>
>  use charnames qw(:full);
>  my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
>
> =Match Unicode grapheme cluster in regex
>
>  \X
>
>  # match and grab five first graphemes
>  my($first_five) = /^(\X{5})/;
>
>  # Find vowel plus any diacritics
>  use Unicode::Normalize;
>  my $nfd = NFD($orig);
>  $nfd =~ /(?=[aeiou])\X/i
>
> =Reverse string by grapheme
>
>  $str = join("", reverse $str =~ /\X/g);
>
>  # OR: cpan -i Unicode::GCString
>  use Unicode::GCString;
>  $str = reverse Unicode::GCString->new($str);
>
> =String length in graphemes
>
>  my $count = 0;
>  while ($str =~ /\X/g) { $count++ }
>
> # OR: cpan -i Unicode::GCString
>  use Unicode::GCString;
>  $gcs = Unicode::GCString->new($str);
>  my $count = $gcs->length;
>
> =Substring by grapheme
>
> # cpan -i Unicode::GCString
>  use Unicode::GCString;
>  $gcs = Unicode::GCString->new($str);
>  my $piece = $gcs->substr(5, 5);
>
> =Unicode column-width for printing
>
> # cpan -i Unicode::GCString
>  use Unicode::GCString;
>  $gcs = Unicode::GCString->new($str);
>  my $cols = $gcs->columns;
>  printf "%*s\n", $cols, $str;
>
> =Unicode normalization
>
>  use Unicode::Normalize;
>  my $nfd = NFD($orig);
>  my $nfc = NFC($orig);
>  my $nfkd = NFKD($orig);
>  my $nfkc = NFKC($orig);
>
> =Unicode collation
>
>  use Unicode::Collate;
>  my $col = Unicode::Collate->new();
>  my @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive Unicode sort
>
>  use Unicode::Collate;
>  my $col = Unicode::Collate->new(level => 1);
>  my @list = $col->sort(@old_list);
>
> =Unicode locale collation
>
>  # either use v5.12, OR: cpan -i Unicode::Collate::Locale
>  use Unicode::Collate::Locale;
>  my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
>  my @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive comparisons
>
>  use utf8;
>  use Unicode::Collate;
>    my $coll = Unicode::Collate->new(
>        level => 1,
>        normalization => undef
>  );
>
> # now both are true:
>    $coll->eq("García",  "GARCIA" );
>    $coll->eq("Márquez", "MARQUEZ");
>
> =Unicode linebreaking
>
>  # cpan -i Unicode::LineBreak
>  use Unicode::LineBreak;
>  use charnames qw(:full);
>
>  my $para = "This is a super\N{HYPHEN}long string. " x 20;
>  my $fmt = new Unicode::LineBreak;
>  print $fmt->break($para), "\n";
>
> =Declare std streams to be utf8
>
>    $ perl -CS ...
>  or
>    $ export PERL_UNICODE=S
>  or
>    use open qw(:std :utf8);
>  or
>    binmode(STDIN, ":utf8");
>    binmode(STDOUT, ":utf8");
>    binmode(STDERR, ":utf8");
>
> =Make I/O default to utf8
>
>    $ perl -CSD ...
>  or
>    $ export PERL_UNICODE=SD
>  or
>    use open qw(:std :utf8);
>
> =Open file with implicit encode/decode
>
>  # input file
>    open(my $in_file, "< :encoding(UTF-16)", "wintext");
>  OR
>    open(my $in_file, "<", "wintext");
>    binmode($in_file, ":encoding(UTF-16)");
>  THEN
>    my $line = <$in_file>;
>
>  # output file
>    open(my $out_file, "> :encoding(cp1252)", "wintext");
>  OR
>    open(my $out_file, ">", "wintext");
>    binmode($out_file, ":encoding(cp1252)");
>  THEN
>    print $out_file "some text\n";
>
> =Explicit encode/decode [rarely needed, see previous]
>
>  use Encode qw(encode decode);
>
>    my $chars = decode("shiftjis", $bytes);
>  OR
>        my $bytes = encode("MIME-Header-ISO_2022_JP", $chars);


Moscow-pm