[Moscow.pm] Unicode. Ликбез

Пн Фев 20 13:07:53 PST 2012

On Fri, Feb 17, 2012 at 11:43 AM, Orlovsky Alexander
<nordicdyno at yandex.ru> wrote:
> Я тут себе IT-бложик завел, заодно, кроме всего прочего, опубликовал там текст на основе моего декабрьского доклада на Saint Perl "Unicode. Ликбез":
> http://nordicdyno.github.com/blog/2012/02/17/unicode-basics/

Кстати, может быть тут не все п5п читают, коротко о поддержке юникода в перл:

On Mon, Feb 20, 2012 at 7:27 PM, Tom Christiansen <tchrist at perl.com> wrote:
> Inspired by how scandalously Unicode-deficient the
> otherwise fine 4-way polyglot table comparing PHP, Perl,
> Python, and Ruby is at
>
>    http://hyperpolyglot.org/scripting
>
> I created a quick Unicode cheatsheet for Perl, mostly by
> mining the examples in the new 4th edition of the Camel.
>
> Gee, I foresee a *whole* lot of "impossibles" in the
> other three languages' columns, don't you? :)
>
> Hm, have I left anything out that Perl is especially cool with?
>
> I almost wonder whether this sort of thing oughtn't be a manpage,
> something like perluni{ref,cheat,quick}?
>
> --tom
>
> =Characters and their numbers
>
>    # ASCII
>    ord("A")
>    chr(65)
>
>    # BMP
>    ord("Σ")
>    chr(0x3A3)
>
>    # beyond the BMP
>    ord("𝑛")
>    chr(0x1D45B)
>
>    # beyond Unicode (up to MAXINT)
>    ord("\x{20_0000}")
>    chr(0x20_0000)
>
> =Unicode literals by character number
>
>    String: "\x{3a3}"
>    Regex:  /\x{3a3}/
>
>    String: "\x{1d45b}"
>    Regex:  /\x{1d45b}/
>
>    # even non-BMP ranges in regex work fine
>    /[\x{1D434}-\x{1D467}]/
>
> =Get character name by number
>
>    use charnames ();
>    my $name = charnames::viacode(0x03A3);
>
> =Get character number by name
>
>    use charnames ();
>    my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
>
> =Unicode named characters
>
>    use charnames qw(:full :short greek);
>
>    "\N{MATHEMATICAL ITALIC SMALL N}"
>    "\N{GREEK CAPITAL LETTER SIGMA}"
>    "\N{Greek:Sigma}"
>    "\N{epsilon}"
>
> =Unicode named sequences
>
>    use charnames qw(:full);
>    my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
>    printf "U+%v04X\n", $seq;
>    U+0100.0300
>
> =Custom named characters
>
>    use charnames ":full", ":alias" => {
>        ecute => "LATIN SMALL LETTER E WITH ACUTE",
>        "APPLE LOGO" => 0xF8FF, # private use character
>    };
>
>    "\N{ecute}"
>    "\N{APPLE LOGO}"
>
> =Declare source in utf8 for identifiers and literals
>
>    use utf8;
>
>    my $measure   = "Ångström";
>    my @μsoft     = qw( cp852 cp1251 cp1252 );
>    my @ὑπέρμεγας = qw( ὑπέρ μεγας );
>    my @鯉        = qw( koi8-f koi8-u koi8-r );
>
> =Unicode casing
>
>    uc("henry ⅷ")  # "HENRY Ⅷ"
>    uc("tschüß")   # "TSCHÜSS"
>
>    # both are true:
>    "tschüß"  =~ /TSCHÜSS/i
>    "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i
>
> =Unicode case-insensitive comparisons
>
>    use utf8;
>    use feature "fc"; # fc() function is from v5.16
>
>    # sort case-insensitively
>    my @sorted = sort { fc($a) cmp fc($b) } @list;
>
>    # both are true:
>    fc("tschüß")  eq fc("TSCHÜSS")
>    fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
>
> =Match Unicode linebreak sequence in regex
>
>    \R
>
>    s/\R/\n/g;  # normalize all linebreaks to \n
>
> =Match Unicode properties in regex with \p, \P
>
>    \pL, \pN, \pS, \pP, \pM, \pZ, \pC
>    \p{Sk}, \p{Ps}, \p{Lt}
>    \p{alpha}, \p{upper}, \p{lower}
>    \p{Latin}, \p{Greek}
>    \p{script=Latin}, \p{script=Greek}
>    \p{East_Asian_Width=Wide}, \p{EA=W}
>    \p{Line_Break=Hyphen}, \p{LB=HY}
>    \p{Numeric_Value=4}, \p{NV=4}
>
> =Custom character properties
>
>    # using private-use characters
>    sub In_Tengwar { "E000\tE07F\n" }
>
>    if (/\p{In_Tengwar}/) { ... }
>
>    # blending existing properties
>    sub Is_GraecoRoman_Title {<<'END_OF_SET'}
>    +utf8::IsLatin
>    +utf8::IsGreek
>    &utf8::IsTitle
>    END_OF_SET
>
>    if (/\p{Is_GraecoRoman_Title}/) { ... }
>
> =Get character category
>
>    use Unicode::UCD qw(charinfo);
>    my $cat = charinfo(0x3A3)->{category};  # "Lu"
>
> =Convert non-ASCII Unicode numerics
>
>    # from v5.12
>    use Unicode::UCD qw(num);
>    if (/(\d+|\N)/) {  # not just ASCII!
>        $nv = num($1);
>    }
>
>    use charnames qw(:full);
>    my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
>
> =Match Unicode grapheme cluster in regex
>
>    \X
>
>    # match and grab five first graphemes
>    my($first_five) = /^(\X{5})/;
>
>    # Find vowel plus any diacritics
>    use Unicode::Normalize;
>    my $nfd = NFD($orig);
>    $nfd =~ /(?=[aeiou])\X/i
>
> =Reverse string by grapheme
>
>    $str = join("", reverse $str =~ /\X/g);
>
>    # OR: cpan -i Unicode::GCString
>    use Unicode::GCString;
>    $str = reverse Unicode::GCString->new($str);
>
> =String length in graphemes
>
>    my $count = 0;
>    while ($str =~ /\X/g) { $count++ }
>
>  # OR: cpan -i Unicode::GCString
>    use Unicode::GCString;
>    $gcs = Unicode::GCString->new($str);
>    my $count = $gcs->length;
>
> =Substring by grapheme
>
>  # cpan -i Unicode::GCString
>    use Unicode::GCString;
>    $gcs = Unicode::GCString->new($str);
>    my $piece = $gcs->substr(5, 5);
>
> =Unicode column-width for printing
>
>  # cpan -i Unicode::GCString
>    use Unicode::GCString;
>    $gcs = Unicode::GCString->new($str);
>    my $cols = $gcs->columns;
>    printf "%*s\n", $cols, $str;
>
> =Unicode normalization
>
>    use Unicode::Normalize;
>    my $nfd  = NFD($orig);
>    my $nfc  = NFC($orig);
>    my $nfkd = NFKD($orig);
>    my $nfkc = NFKC($orig);
>
> =Unicode collation
>
>    use Unicode::Collate;
>    my $col = Unicode::Collate->new();
>    my @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive Unicode sort
>
>    use Unicode::Collate;
>    my $col = Unicode::Collate->new(level => 1);
>    my @list = $col->sort(@old_list);
>
> =Unicode locale collation
>
>    # either use v5.12, OR: cpan -i Unicode::Collate::Locale
>    use Unicode::Collate::Locale;
>    my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
>    my @list = $col->sort(@old_list);
>
> =Case- *and* accent-insensitive comparisons
>
>    use utf8;
>    use Unicode::Collate;
>    my $coll = Unicode::Collate->new(
>                level => 1,
>                normalization => undef
>    );
>
>  # now both are true:
>    $coll->eq("García",  "GARCIA" );
>    $coll->eq("Márquez", "MARQUEZ");
>
> =Unicode linebreaking
>
>    # cpan -i Unicode::LineBreak
>    use Unicode::LineBreak;
>    use charnames qw(:full);
>
>    my $para = "This is a super\N{HYPHEN}long string. " x 20;
>    my $fmt = new Unicode::LineBreak;
>    print $fmt->break($para), "\n";
>
> =Declare std streams to be utf8
>
>        $ perl -CS ...
>    or
>        $ export PERL_UNICODE=S
>    or
>        use open qw(:std :utf8);
>    or
>        binmode(STDIN,  ":utf8");
>        binmode(STDOUT, ":utf8");
>        binmode(STDERR, ":utf8");
>
> =Make I/O default to utf8
>
>        $ perl -CSD ...
>    or
>        $ export PERL_UNICODE=SD
>    or
>        use open qw(:std :utf8);
>
> =Open file with implicit encode/decode
>
>    # input file
>        open(my $in_file, "< :encoding(UTF-16)", "wintext");
>    OR
>        open(my $in_file, "<", "wintext");
>        binmode($in_file, ":encoding(UTF-16)");
>    THEN
>        my $line = <$in_file>;
>
>    # output file
>        open(my $out_file, "> :encoding(cp1252)", "wintext");
>    OR
>        open(my $out_file, ">", "wintext");
>        binmode($out_file, ":encoding(cp1252)");
>    THEN
>        print $out_file "some text\n";
>
> =Explicit encode/decode  [rarely needed, see previous]
>
>    use Encode qw(encode decode);
>
>        my $chars = decode("shiftjis", $bytes);
>    OR
>        my $bytes = encode("MIME-Header-ISO_2022_JP", $chars);