| Al Beshenov ( @ 2007-01-31 06:01:00 |
Code highlighting
Hi!
I am not a Perl geek and I am using patterns and recipes to cook scripts.
I am trying to do conversion from C / C++ code to the hypertext with style sheets. Actually, it is a code highlighting.
Script below (sorry for huge code citation) does not work.
- It is stupid and complex :-)
- It highlights special words inside strings.
- It 'eats itself' and replacing
<span class="keyword">to<span <span class="keyword">class</span>="keyword">in some cases! - It can't highlight word at start of line.
- It does not highlights something (why?).
Help me to correct it.
Are there good scripts for highlighting code on C / C++ / Java / Perl / Bash / etc.?
Thanks!
#!/usr/bin/perl -w
# cpp2xhtml.pl
my $directive_class = "directive";
my $comment_class = "comment";
my $function_class = "function";
my $method_class = "method";
my $string_class = "string";
my $character_class = "character";
my $type_class = "type";
my $keyword_class = "keyword";
my $libtype_class = "libtype";
my $comment = 0;
# Words to highlight...
# keywords, operators, etc.:
my @operators_keywords = qw { and and_eq asm bitand bitor break
case catch class compl const_cast continue default delete do
dynamic_cast else enum explicit export false for friend goto if
inline namespace new not not_eq operator or or_eq private
protected public reinterpret_cast restrict return sizeof
static_cast struct switch template this throw true try typedef
typeid typename union using virtual volatile while xor xor_eq };
# modifiers:
my @modifiers = qw { auto const extern long mutable register
short signed static unsigned };
# Basic types / primitives
my @primitives = qw { _Bool _Complex _Complex_I _Imaginary
_Imaginary_I bool char double float int wchar_t void };
# Macro, types, classes, defined by language library:
my @library_words = qw { _IOFBF _IOLBF _IONBF abort adjustfield
allocator allocator_type app ate atexit auto_ptr
back_insert_iterator bad_alloc bad_cast bad_exception bad_typeid
badbit basefield basic_filebuf basic_fstream basic_ifstream
basic_ios basic_iostream basic_istream basic_istringstream
basic_ofstream basic_ostream basic_ostringstream basic_streambuf
basic_string basic_stringbuf basic_stringstream beg
bidirectional_iterator_tag binary binary_function binder1st
binder2nd boolalpha BUFSIZ BUFSIZE cerr CHAR_BIT CHAR_MAX
CHAR_MIN CHAR_T char_traits char_type cin clock_t
CLOCKS_PER_SEC clog codecvt codecvt_byname collate_byname
complex const_iterator const_mem_fun_ref_t const_mem_fun_t
const_mem_fun1_ref_t const_mem_fun1_t const_pointer
const_reference const_reverse_iterator cout ctype ctype_byname
cur DBL_DIG DBL_EPSILON DBL_MANT_DIG DBL_MAX DBL_MAX_10_EXP
DBL_MAX_EXP DBL_MIN DBL_MIN_10_EXP DBL_MIN_EXP dec
difference_type div_t divides domain_error double_t end EOF
eofbit equal_to exception exit EXIT_FAILURE EXIT_SUCCESS failbit
failure FE_ALL_EXCEPT FE_DFL_ENV FE_DIVBYZERO FE_DOWNWARD
FE_INEXACT FE_INVALID FE_OVERFLOW FE_TONEAREST FE_TOWARDZERO
FE_UNDERFLOW FE_UPWARD fenv_t fexcept_t FILE filebuf
FILENAME_MAX fixed float_denorm_style float_round_style float_t
floatfield FLT_DIG FLT_EPSILON FLT_MANT_DIG FLT_MAX
FLT_MAX_10_EXP FLT_MAX_EXP FLT_MIN FLT_MIN_10_EXP FLT_MIN_EXP
FLT_RADIX FLT_ROUNDS fmtflags FOPEN_MAX forward_iterator_tag
FP_FAST_FMA FP_FAST_FMAF FP_FAST_FMAL FP_ILOGB0 FP_ILOGBNAN
FP_INFINITE FP_NAN FP_NORMAL FP_SUBNORMAL FP_ZERO fpos fpos_t
front_insert_iterator fstream goodbit greater greater_equal
gslice_array hex HUGE_VAL HUGE_VALF HUGE_VALL ifstream imaxdiv_t
in indirect_array INFINITY input_iterator_tag insert_iterator
INT_MAX INT_MIN INT_T int_type internal invalid_argument ios
iostate iostream istream istream_iterator istreambuf_iterator
istringstream iterator iterator_category iterator_traits jmp_buf
key_compare key_type L_tmpnam LC_ALL LC_COLLATE LC_CTYPE
LC_MONETARY LC_NUMERIC LC_TIME lconv LDBL_DIG LDBL_EPSILON
LDBL_MANT_DIG LDBL_MAX LDBL_MAX_10_EXP LDBL_MAX_EXP LDBL_MIN
LDBL_MIN_10_EXP LDBL_MIN_EXP ldiv_t left length_error less
less_equal locale logic_error logical_and logical_not logical_or
LONG_MAX LONG_MIN mapped_type mask_array MATH_ERREXCEPT
math_errhandling MATH_ERRNO MB_CUR_MAX MB_LEN_MAX mbstate_t
mem_fun_ref_t mem_fun_t mem_fun1_ref_t mem_fun1_t messages
messages_base messages_byname minus modulus money_base money_get
money_put moneypunct moneypunct_byname multiplies NAN negate
new_handler not_equal_to nothrow nothrow_t NULL num_get num_put
numeric_limits numpunct oct OFF_T off_type offsetof ofstream
openmode ostream ostream_iterator ostreambuf_iterator
ostringstream out out_of_range output_iterator_tag overflow_error
pair plus pointer pointer_to_binary_function
pointer_to_unary_function POS_T pos_type ptrdiff_t RAND_MAX
range_error raw_storage_iterator rebind reference reverse_iterator
right runtime_error SCHAR_MAX SCHAR_MIN scientific SEEK_SET seekdir
sentry showbase showpoint showpos SHRT_MAX SHRT_MIN sig_atomic_t
SIG_DFL SIG_ERR SIG_FPE SIG_IGN SIG_INT SIG_TERM SIGABRT SIGILL
SIGSEGV size_t size_type skipws slice_array STATE_T state_type
stderr stdin stdout streambuf streamoff streampos streamsize
stringbuf stringstream terminate_handler time_get time_get_byname
time_put time_put_byname time_t tm TMP_MAX traits_type trunc
type_info UCHAR_MAX UINT_MAX ULONG_MAX unary_function unary_negate
underflow_error unexpected_handler unitbuf uppercase USHRT_MAX
va_arg va_end va_list va_start valarray value_compare value_type
wcerr WCHAR_MAX WCHAR_MIN wchar_t wcin wclog wcout wctrans_t wctype_t
WEOF wfilebuf wifstream wint_t wios wiostream wistream wistringstream
wostream wostringstream wstreambuf wstreampos wstringbuf wstringstream };
print "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<link rel=\"stylesheet\" type=\"text/css\" href=\"code.css\" />
<title></title>
</head>
<body>
<pre>";
foreach (<STDIN>) {
# Step 1: replace <, >, & by the XML ENTITIES
$_ =~ s/&/&/;
$_ =~ s/</</;
$_ =~ s/>/>/;
$_ =~ s/\x0D|\x0A|\x0D|\x0C|\x{2028}|\x{2029}//;
$_ = "$_\n";
# Step 2: highlight block comments
if ($_ =~ m/\/\*.*/) { $comment++; }
$_ =~ s/(\/\*)/<span class="$comment_class">$1/;
$_ =~ s/(\*\/)/$1<\/span>/;
if ($_ =~ m/\.*\*\//) { $comment--; }
if (!$comment) {
# Step 3: highlight preprocessor instrunctions and line comments
if ($_ =~ m/.*\/\/.*/ || $_ =~ m/#.*/ ) {
$_ =~ s/(.*)(\/\/[^\n]*)([\n]*)/$1<span class="$comment_class">$2<\/span>$3/;
$_ =~ s/(#[^<]*[^\n]*)([\n]*)/<span class="$directive_class">$1<\/span>$2/;
}
else {
# Step 5: highlight strings
$_ =~ s/([^\\])("[^"]*")([^\\])/$1<span class="$string_class">$2<\/span>$3/;
# Step 6: highlight characters
$_ =~ s/([^\\])('[^']*')([^\\])/$1<span class="$character_class">$2<\/span>$3/;
# Step 7: highlight keywords, operators, etc.
foreach my $operator_keyword (@operators_keywords) {
$_ =~ s/([^\w]+)($operator_keyword)([^\w]+|\n)/$1<span class="$keyword_class">$2<\/span>$3/;
}
# Step 8: highlight modifiers and basic types / primitives
foreach my $modifier (@modifiers) {
$_ =~ s/([^\w]+)($modifier)([^\w]+|\n)/$1<span class="$type_class">$2<\/span>$3/;
}
# Step 9: highlight macro, types, classes, defined by language library:
foreach my $library_word (@library_words) {
$_ =~ s/([^\w]+)($library_word)([^\w]+|\n)/$1<span class="$libtype_class">$2<\/span>$3/;
}
# Step 10: highlight functions and methods
$_ =~ s/\.([a-zA-z_]\w*)(\s*)\(/.<span class="$method_class">$1<\/span>$2(/;
$_ =~ s/([a-zA-z_]\w*)(\s*)\(/<span class="$function_class">$1<\/span>$2(/;
}
}
$_ =~ s/(\s+)(<\/[a-zA-z]+>)/$2$1/;
print "$_";
}
print "</pre>
</body>
</html>";
/* code.css */
.keyword, .libtype { font-weight: bold }
.type { color: #900 }
.comment { color: #666; font-style: italic }
.libtype { color: #00f }
.string { color: #f00 }
.character { color: #f0f }
.method, .function { color: #009 }
.directive { color: #090 }