[Moscow.pm] Unicode. Ликбез

Alexandr Gomoliako zzz на zzz.org.ua
Пн Фев 20 13:07:53 PST 2012


On Fri, Feb 17, 2012 at 11:43 AM, Orlovsky Alexander
<nordicdyno at yandex.ru> wrote:
> Я тут себе IT-бложик завел, заодно, кроме всего прочего, опубликовал там текст на основе моего декабрьского доклада на Saint Perl "Unicode. Ликбез":
> http://nordicdyno.github.com/blog/2012/02/17/unicode-basics/

Кстати, может быть тут не все п5п читают, коротко о поддержке юникода в перл:

On Mon, Feb 20, 2012 at 7:27 PM, Tom Christiansen <tchrist at perl.com> wrote:
> Inspired by how scandalously Unicode-deficient the
> otherwise fine 4-way polyglot table comparing PHP, Perl,
> Python, and Ruby is at
>
>    http://hyperpolyglot.org/scripting
>
> I created a quick Unicode cheatsheet for Perl, mostly by
> mining the examples in the new 4th edition of the Camel.
>
> Gee, I foresee a *whole* lot of "impossibles" in the
> other three languages' columns, don't you? :)
>
> Hm, have I left anything out that Perl is especially cool with?
>
> I almost wonder whether this sort of thing oughtn't be a manpage,
> something like perluni{ref,cheat,quick}?
>
> --tom
>
> =Characters and their numbers
>
> š š# ASCII
> š šord("A")
> š šchr(65)
>
> š š# BMP
>    ord("Σ")
> š šchr(0x3A3)
>
> š š# beyond the BMP
>    ord("𝑛")
> š šchr(0x1D45B)
>
> š š# beyond Unicode (up to MAXINT)
> š šord("\x{20_0000}")
> š šchr(0x20_0000)
>
> =Unicode literals by character number
>
> š šString: "\x{3a3}"
> š šRegex: š/\x{3a3}/
>
> š šString: "\x{1d45b}"
> š šRegex: š/\x{1d45b}/
>
> š š# even non-BMP ranges in regex work fine
> š š/[\x{1D434}-\x{1D467}]/
>
> =Get character name by number
>
> š šuse charnames ();
> š šmy $name = charnames::viacode(0x03A3);
>
> =Get character number by name
>
> š šuse charnames ();
> š šmy $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
>
> =Unicode named characters
>
> š šuse charnames qw(:full :short greek);
>
> š š"\N{MATHEMATICAL ITALIC SMALL N}"
> š š"\N{GREEK CAPITAL LETTER SIGMA}"
> š š"\N{Greek:Sigma}"
> š š"\N{epsilon}"
>
> =Unicode named sequences
>
> š šuse charnames qw(:full);
> š šmy $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
> š šprintf "U+%v04X\n", $seq;
> š šU+0100.0300
>
> =Custom named characters
>
> š šuse charnames ":full", ":alias" => {
> š š š šecute => "LATIN SMALL LETTER E WITH ACUTE",
> š š š š"APPLE LOGO" => 0xF8FF, # private use character
> š š};
>
> š š"\N{ecute}"
> š š"\N{APPLE LOGO}"
>
> =Declare source in utf8 for identifiers and literals
>
> š šuse utf8;
>
>    my $measure   = "Ångström";
>    my @μsoft     = qw( cp852 cp1251 cp1252 );
>    my @ὑπέρμεγας = qw( ὑπέρ μεγας );
>    my @鯉        = qw( koi8-f koi8-u koi8-r );
>
> =Unicode casing
>
>    uc("henry ⅷ")  # "HENRY Ⅷ"
>    uc("tschüß")   # "TSCHÜSS"
>
> š š# both are true:
>    "tschüß"  =~ /TSCHÜSS/i
>    "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i
>
> =Unicode case-insensitive comparisons
>
> š šuse utf8;
> š šuse feature "fc"; # fc() function is from v5.16
>
> š š# sort case-insensitively
> š šmy @sorted = sort { fc($a) cmp fc($b) } @list;
>
> š š# both are true:
>    fc("tschüß")  eq fc("TSCHÜSS")
>    fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
>
> =Match Unicode linebreak sequence in regex
>
> š š\R
>
> š šs/\R/\n/g; š# normalize all linebreaks to \n
>
> =Match Unicode properties in regex with \p, \P
>
> š š\pL, \pN, \pS, \pP, \pM, \pZ, \pC
> š š\p{Sk}, \p{Ps}, \p{Lt}
> š š\p{alpha}, \p{upper}, \p{lower}
> š š\p{Latin}, \p{Greek}
> š š\p{script=Latin}, \p{script=Greek}
> š š\p{East_Asian_Width=Wide}, \p{EA=W}
> š š\p{Line_Break=Hyphen}, \p{LB=HY}
> š š\p{Numeric_Value=4}, \p{NV=4}
>
> =Custom character properties
>
> š š# using private-use characters
> š šsub In_Tengwar { "E000\tE07F\n" }
>
> š šif (/\p{In_Tengwar}/) { ... }
>
> š š# blending existing properties
> š šsub Is_GraecoRoman_Title {<<'END_OF_SET'}
> š š+utf8::IsLatin
> š š+utf8::IsGreek
> š š&utf8::IsTitle
> š šEND_OF_SET
>
>    if (/\p{Is_GraecoRoman_Title}/) { ... }
>
> =Get character category
>
> š šuse Unicode::UCD qw(charinfo);
> š šmy $cat = charinfo(0x3A3)->{category}; š# "Lu"
>
> =Convert non-ASCII Unicode numerics
>
> š š# from v5.12
> š šuse Unicode::UCD qw(num);
>    if (/(\d+|\N)/) {  # not just ASCII!
> š š š š$nv = num($1);
> š š}
>
> š šuse charnames qw(:full);
> š šmy $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
>
> =Match Unicode grapheme cluster in regex
>
> š š\X
>
> š š# match and grab five first graphemes
> š šmy($first_five) = /^(\X{5})/;
>
> š š# Find vowel plus any diacritics
> š šuse Unicode::Normalize;
> š šmy $nfd = NFD($orig);
> š š$nfd =~ /(?=[aeiou])\X/i
>
> =Reverse string by grapheme
>
> š š$str = join("", reverse $str =~ /\X/g);
>
> š š# OR: cpan -i Unicode::GCString
> š šuse Unicode::GCString;
> š š$str = reverse Unicode::GCString->new($str);
>
> =String length in graphemes
>
> š šmy $count = 0;
>    while ($str =~ /\X/g) { $count++ }
>
> š# OR: cpan -i Unicode::GCString
> š šuse Unicode::GCString;
> š š$gcs = Unicode::GCString->new($str);
> š šmy $count = $gcs->length;
>
> =Substring by grapheme
>
> š# cpan -i Unicode::GCString
> š šuse Unicode::GCString;
> š š$gcs = Unicode::GCString->new($str);
> š šmy $piece = $gcs->substr(5, 5);
>
> =Unicode column-width for printing
>
> š# cpan -i Unicode::GCString
> š šuse Unicode::GCString;
> š š$gcs = Unicode::GCString->new($str);
> š šmy $cols = $gcs->columns;
>    printf "%*s\n", $cols, $str;
>
> =Unicode normalization
>
> š šuse Unicode::Normalize;
> š šmy $nfd š= NFD($orig);
> š šmy $nfc š= NFC($orig);
> š šmy $nfkd = NFKD($orig);
> š šmy $nfkc = NFKC($orig);
>
> =Unicode collation
>
> š šuse Unicode::Collate;
> š šmy $col = Unicode::Collate->new();
> š šmy @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive Unicode sort
>
> š šuse Unicode::Collate;
> š šmy $col = Unicode::Collate->new(level => 1);
> š šmy @list = $col->sort(@old_list);
>
> =Unicode locale collation
>
> š š# either use v5.12, OR: cpan -i Unicode::Collate::Locale
> š šuse Unicode::Collate::Locale;
> š šmy $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
> š šmy @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive comparisons
>
> š šuse utf8;
> š šuse Unicode::Collate;
>    my $coll = Unicode::Collate->new(
> š š š š š š š šlevel => 1,
> š š š š š š š šnormalization => undef
> š š);
>
> š# now both are true:
>    $coll->eq("García",  "GARCIA" );
>    $coll->eq("Márquez", "MARQUEZ");
>
> =Unicode linebreaking
>
> š š# cpan -i Unicode::LineBreak
> š šuse Unicode::LineBreak;
> š šuse charnames qw(:full);
>
> š šmy $para = "This is a super\N{HYPHEN}long string. " x 20;
> š šmy $fmt = new Unicode::LineBreak;
> š šprint $fmt->break($para), "\n";
>
> =Declare std streams to be utf8
>
> š š š š$ perl -CS ...
> š šor
> š š š š$ export PERL_UNICODE=S
> š šor
> š š š šuse open qw(:std :utf8);
> š šor
> š š š šbinmode(STDIN, š":utf8");
> š š š šbinmode(STDOUT, ":utf8");
> š š š šbinmode(STDERR, ":utf8");
>
> =Make I/O default to utf8
>
> š š š š$ perl -CSD ...
> š šor
> š š š š$ export PERL_UNICODE=SD
> š šor
> š š š šuse open qw(:std :utf8);
>
> =Open file with implicit encode/decode
>
> š š# input file
> š š š šopen(my $in_file, "< :encoding(UTF-16)", "wintext");
> š šOR
> š š š šopen(my $in_file, "<", "wintext");
> š š š šbinmode($in_file, ":encoding(UTF-16)");
> š šTHEN
> š š š šmy $line = <$in_file>;
>
> š š# output file
>        open(my $out_file, "> :encoding(cp1252)", "wintext");
> š šOR
> š š š šopen(my $out_file, ">", "wintext");
> š š š šbinmode($out_file, ":encoding(cp1252)");
> š šTHEN
> š š š šprint $out_file "some text\n";
>
> =Explicit encode/decode š[rarely needed, see previous]
>
> š šuse Encode qw(encode decode);
>
> š š š šmy $chars = decode("shiftjis", $bytes);
> š šOR
>        my $bytes = encode("MIME-Header-ISO_2022_JP", $chars);


Подробная информация о списке рассылки Moscow-pm