#!/usr/local/bin/perl
$VERSION = "1.14";
# CGI code.pl
# Version 1.14
# Part of "Cyrillic Software Suite"
# Get docs and newest version from
# http://www.neystadt.org/cyrillic/
#
# Copyright (c) 1997-98, John Neystadt
# You may install this script on your web site for free
# To obtain permision for redistribution or any other usage
# contact john@neystadt.org.
#
# Drop me a line if you deploy this script on your site.
=head1 NAME
code.pl v1.14 - CGI script to convert on-the-fly html pages across cyrillic charsets
=cut
use Convert::Cyrillic;
use LWP::UserAgent;
use HTTP::Headers::UserAgent;
$path=".."; # <==== path from cgi-bin to the server root.
$defcode="WIN"; # <==== default source encoding
$maxsize=500000; # maximum file size
$IndexFileName = 'index.html';
$UserAgent=$ENV{HTTP_USER_AGENT};
$scrname=$ENV{SCRIPT_NAME};
$file=$ENV{PATH_INFO};
$file=~s/^$scrname//;
$file=~s/\+/ /go;
$file=~s/%(..)/pack("c",hex($1))/ge;
if ($file=~/[\.\/\\]([^\.\/\\]+)$/o) {$ext=lc($1);} else {$ext='html';}
$file=~s%^\/([^\/]*)%%o;
$lang=uc($+);
if ($lang eq 'RUS') {
print "Content-type: text/html\n
Select Russian encoding:
";
goto end;
}
if ($lang=~/(.*)-(.*)/o) { $charset=$1; $lang=$2; }
if (!(',ISO,KOI8,KOI,DOS,WIN,VOL,MAC,UTF8,NOCS,AUTO,' =~ /,$lang,/i)) {
$err = "Unsupported code - $lang";
goto error;
}
$file =~ s|http:/([^/])|http://$1|oi; # Some vers of Ms-IIS merge '//' into '/' in Urls
if ($file =~ s|^/(http://)|$1|oi) {
$url=$ENV {'QUERY_STRING'};
if ($url) { $url= "?" . $url; }
$url = $file . $url;
my $ua = new LWP::UserAgent;
$ua->agent("code.pl/$VERSION " . $ua->agent);
$ua->from ('leonid@neystadt.org');
my $req = new HTTP::Request (GET => $url);
my $res = $ua->request ($req);
if (!$res->is_success) {
my $err = $res->error_as_HTML();
print <<"EOF";
Content-Type: text/html
Failure
Failed to retrive url: $url.
Remote server returned the following reponse:
$err
EOF
goto end;
}
$type = $res->content_type;
$buffer = $res->content;
#neystadt::http_rtr::Http_Retrieve ($url, $buffer, $hdrs);
#$hdrs=~/Content-Type: (.*)\n/io; $type = $1;
} else {
if ($file=~/cgi-bin/io) {
$err = "Incorrect file name";
goto error;
}
$file = "$path$file";
if (-d $file) {
$file = "$file/$IndexFileName";
$ext = 'htm';
}
if (open In,"$file") {
binmode In; read (In, $buffer, $maxsize); close In;
} else {
print "Content-type: text/html
HTTP ErrorError: 404 Not Found
The requested URI $file does not exist.
";
goto end;
}
}
if ($lang=~/auto/io){
$platform = HTTP::Headers::UserAgent::GetPlatform ($UserAgent);
$lang='koi';
$lang='win' if $platform=~/WIN/io;
$lang='mac' if $platform eq 'MAC';
$lang='koi' if $platform eq 'UNIX';
$lang='dos' if $platform eq 'OS2';
$lang='nocs' if $platform eq 'Linux';
}
$newcharset = "koi8-r" if $lang=~/koi|nocs/io;
$newcharset = "windows-1251" if $lang=~/win/io;
$newcharset = "x-mac-cyrillic" if $lang=~/mac/io;
$newcharset = "ibm866" if $lang=~/dos/io;
$newcharset = "ISO-8859-5" if $lang=~/iso/io;
$newcharset = "utf-8" if $lang=~/utf8/io;
if ($buffer=~s/<\s*META\s+HTTP-EQUIV\s*=\s*"?Content-Type"?\s+CONTENT\s*=\s*"?(.*);\s+charset\s*=\s*(.*)"?\s*>//io) {
$type=$1; $charset=$2 if !$charset;
if ($lang=~/nocs|vol/io){
$buffer=~s/<\s*META\s+HTTP-EQUIV\s*=\s*"?Content-Type"?\s+CONTENT\s*=\s*"?(.*);\s+charset\s*=\s*(.*)"?\s*>//io;
}
}
else {
$type="text/html" if $ext eq 'html' || $ext eq 'htm';
$type="text/plain" if $ext eq 'txt';
$type="image/gif" if $ext eq 'gif';
$type="image/jpeg" if $ext eq 'jpg' || $ext eq 'jpeg';
}
$lang="koi8" if $lang=~/nocs/io;
$type="text/html" if !$type;
$slang=$defcode;
$slang="KOI8" if $charset=~/koi/io;
$slang="WIN" if $charset=~/1251/io;
$slang="ISO" if $charset=~/iso/io;
$slang="DOS" if $charset=~/alt/io;
$slang="MAC" if $charset=~/mac/io;
$slang="UTF8" if $charset=~/utf/io;
$slang="UTF8" if $charset=~/unicode/io;
# translate the page
$buffer = Convert::Cyrillic::cstocs ($slang,$lang,$buffer)
if $type =~ /text/o;
if ($hdrs) {
binmode STDOUT;
print $hdrs;
} else {
print("Content-type: $type\n\n");
binmode STDOUT;
}
print $buffer;
goto end;
error:
ermsg($err);
end:;
sub ermsg {
if (!$sw) {$sw=1; print "Content-type: text/plain\n\n";}
print "@_[0]\n";
}
__END__
=head1 DESCRIPTION
Many Russia WWW servers are based on modified APACHE so, that different encodings are returned when clients connect to
different server ports or to different subdomains. This is convenient for servers in Russia, but cannot be used abroad for
Web sites using virtual servers or just having some space at an Internet provider's server. The following approach solves
the problem by using one CGI script without any changes in WWW server software.
Those are code.pl features:
=over
=item *
Can translate localy stored files
=item *
Can translate remote files, retrieving them via HTTP
=item *
Recognizes source encoding from tag inside
=item *
Adjusts the above tag for new encoding or deletes it for buggy browsers.
=item *
Charsets supported:
=over
=item *
B - KOI8-R
=item *
B - WINDOWS-1251
=item *
B - Macintosh
=item *
B - DOS, alternative, CP-866
=item *
B - ISO-8859-5
=item *
B - UTF-8 (Unicode)
=item *
B - Volapuk (transliteration)
=item *
B - KOI8-R, deleting Content-Type META tag, for buggy browsers
=back
=back
=head1 USAGE
=over
=item 1
Put the script in your cgi-bin directory.
=item 2
Edit the script to set script parameters to your configuration
=over
=item *
$path=".."; # <==== path from cgi-bin to the server root.
=item *
$defcode="WIN"; # <==== default source encoding
=item *
$IndexFileName = 'index.html'; # default.htm or index.html, depending on your server
=back
=item 3
Refer to the script as:
I/URL> to be translated.
=over
=item 1
B is one of the above encodings
=item 2
B can also also be of form 'fromcode-tocode' for
explicit definition of the original file encoding.
=item 3
B is absolute URL from the server root (Don't forget to set B<$path> in code.pl) or full URL like http://cnn.com.
=back
=back
All relative references from this page to other WEB pages will be also translated through the same code table (isn't
supported yet for full URLs).
Source encoding is determined by the following algorithm. The first matching rule from this list is selected.
=over
=item 1
If B specified by B form, B is the source encoding.
=item 2
If Metatag like: is present its charset is used. The tag
is updated during translation by replacing source encoding by the destination one.
=item 3
Default encoding is taken from variable $defcode in code.pl.
=back
=head2 CAVEATS
It is recommended that you put on all your pages, and choose only destination encoding
in urls. Do not worry for old buggy browsers which can't display correctly pages with this metatag NOCS encoding converts
page to koi8 and deletes the metatag.
=head1 TIPS AND TRICKS
If you use APPACHE you can add the lines similar to those to your webserver configuration files:
ScriptAlias /koi8 /home/www/neystadt/cgi-bin/code.pl/koi8
ScriptAlias /win /home/www/neystadt/cgi-bin/code.pl/win
ScriptAlias /dos /home/www/neystadt/cgi-bin/code.pl/dos
ScriptAlias /mac /home/www/neystadt/cgi-bin/code.pl/mac
ScriptAlias /iso /home/www/neystadt/cgi-bin/code.pl/iso
ScriptAlias /utf8 /home/www/neystadt/cgi-bin/code.pl/utf8
ScriptAlias /vol /home/www/neystadt/cgi-bin/code.pl/vol
ScriptAlias /lat /home/www/neystadt/cgi-bin/code.pl/vol
ScriptAlias /nocs /home/www/neystadt/cgi-bin/code.pl/nocs
From now you will be able to translate urls like http://www.neystadt.org/russia/ simply by prefixing the url with encoding:
http://www.neystadt.org/koi8/russia/ or http://www.neystadt.org/lat/russia/.
Note that code.pl automatically finds index.html if directory names is given (like in example above). The index file name
can be changed by $IndexFileName parameter in the script.
=head1 EXAMPLES
To translate http://www.neystadt.org/vist/ from Windows-1251 to KOI8:
http://www.neystadt.org/cgi-bin/code.pl/win-koi8/vist/
To translate output of the script http://www.neystadt.org/cgi-bin/miitqr.pl?abc from its default encoding to KOI8:
http://www.neystadt.org/cgi-bin/code.pl/koi8/http://www.neystadt.org/cgi-bin/miitqr.pl?abc
=head1 PREREQUISITES
This script requires the C, C and C
modules available from CPAN or at http://www.neystadt.org/cyrillic/.
=pod OSNAMES
All UNIXes, Windows NT
=pod SCRIPT CATEGORIES
CGI/Filter
=cut