#!/usr/bin/perl
# Demo script: compares a hand-written UTF-8 encoder (utf8ToBytes) with
# Encode::encode('UTF-8', ...) by hex-dumping the results of both.

use strict;
use warnings;
use utf8;    # the source contains a UTF-8 literal; decode it into characters

use Encode qw(encode decode from_to);
use Data::Dumper;

# Run the demo only when executed as a script, so that the subs below can
# also be loaded (require/do) without side effects.
main() unless caller;

# Prints length and hex dump of: the character string, the bytes produced by
# utf8ToBytes(), and the bytes produced by Encode. The last two should match.
sub main {
    my $str = 'раз два';
    print "len: " . length($str) . "\n";
    print hexdump($str), "\n";

    my $arr = utf8ToBytes($str);

    # utf8ToBytes returns byte *values*; pack them into a byte string.
    # (Joining the numbers directly would concatenate their decimal text.)
    my $str2 = pack('C*', @$arr);
    print "len: " . length($str2) . "\n";
    print hexdump($str2), "\n";

    my $str3 = encode('UTF-8', $str);
    print "len: " . length($str3) . "\n";
    print hexdump($str3), "\n";

    return;
}

#========================================================================

# hexdump($stuff)
#
# Returns a multi-line hex/ASCII dump of $stuff, 16 characters per row.
# Each row is a hex column (grouped by 4 and 8) followed by a printable
# column in which anything outside 0x20..0x7E is shown as '.'.
# Rows are separated by "\n"; there is no trailing newline.
# Returns '' for undef or an empty string.
#
# Note: each character is printed with "%02X ", so characters with an
# ordinal above 0xFF take more than two hex digits and shift the columns.
sub hexdump {
    my ($stuff) = @_;
    return '' unless defined $stuff;

    my @chars   = split //, $stuff;
    my $retbuff = '';

    while (@chars) {
        $retbuff .= "\n" if length $retbuff;
        my @row = splice(@chars, 0, 16);

        # Hex column. Always emit 16 slots so the printable column starts
        # at the same offset on a short final row: a missing byte is padded
        # with three spaces, the width of "%02X ".
        for my $i (0 .. 15) {
            $retbuff .= ' ' unless $i % 4;
            $retbuff .= ' ' unless $i % 8;
            $retbuff .= $i < scalar(@row)
                ? sprintf('%02X ', ord($row[$i]))
                : '   ';
        }

        $retbuff .= ' ';

        # Printable column, using the same 4/8 grouping.
        for my $i (0 .. $#row) {
            my $ch = $row[$i];
            $retbuff .= ' ' unless $i % 4;
            $retbuff .= ' ' unless $i % 8;
            $retbuff .= ($ch ge chr(0x20) && $ch le chr(0x7E)) ? $ch : '.';
        }
    }

    return $retbuff;
}

# utf8ToBytes($str)
#
# Encodes the characters of $str as UTF-8 and returns a reference to an
# array of byte values (integers), one element per encoded byte.
#
#   U+0000  .. U+007F   -> 1 byte
#   U+0080  .. U+07FF   -> 2 bytes
#   U+0800  .. U+FFFF   -> 3 bytes
#   U+10000 and above   -> 4 bytes
#
# A high surrogate (D800..DBFF) immediately followed by a low surrogate
# (DC00..DFFF) is combined into one supplementary code point first; an
# unpaired surrogate is encoded as a 3-byte sequence, as before.
# Code points above U+10FFFF are outside Unicode and get no special handling.
# Returns [] for undef or an empty string.
sub utf8ToBytes {
    my ($str) = @_;
    return [] unless defined $str;

    my @code_points = map { ord } split //, $str;
    my $count       = scalar @code_points;
    my @data;

    # C-style loop: a surrogate pair consumes two input elements.
    for (my $i = 0; $i < $count; $i++) {
        my $c = $code_points[$i];

        if ($c < 0x80) {
            push @data, $c;
            next;
        }

        if ($c < 0x800) {
            push @data, 0xC0 | ($c >> 6), 0x80 | ($c & 0x3F);
            next;
        }

        if (   ($c & 0xFC00) == 0xD800
            && $i + 1 < $count
            && ($code_points[ $i + 1 ] & 0xFC00) == 0xDC00)
        {
            # Surrogate pair: rebuild the supplementary code point.
            $c = 0x10000
                + (($c & 0x03FF) << 10)
                + ($code_points[ ++$i ] & 0x03FF);
        }

        if ($c >= 0x10000) {
            # Perl strings hold full code points, so this branch must also
            # serve characters that never were a surrogate pair.
            push @data,
                0xF0 | ($c >> 18),
                0x80 | (($c >> 12) & 0x3F),
                0x80 | (($c >> 6) & 0x3F),
                0x80 | ($c & 0x3F);
        }
        else {
            push @data,
                0xE0 | ($c >> 12),
                0x80 | (($c >> 6) & 0x3F),
                0x80 | ($c & 0x3F);
        }
    }

    return \@data;
}

1;