#!/usr/bin/env perl use strict; use warnings; use 5.012; # Probably works with lower versions, but not tested on them yet. use PPIx::Regexp; use PPIx::Regexp::Dumper; use Data::Dumper; $Data::Dumper::Useqq = 1; use Getopt::Long; my $USAGE = <<"END_OF_USAGE"; $0 --elms [file ...] This program is a stand-alone Perl 5 to Perl 6 regexp translator. In this, its first incarnation, it has a special (and temporary) --elms option to automatically translate the data exported from http://elm.eu.org/elms/browse_elms.html If no options are given, then it translates each line as it is typed into the console. END_OF_USAGE GetOptions( 'elms' => \( my $elms ), ) or die $USAGE; my ( $DUMP_BEFORE, $DUMP_AFTER ) = ( 0, 0 ); =begin comments This program is a stand-alone Perl 5 to Perl 6 regexp translator. Quickly written due to this conversation: http://irclog.perlgeek.de/perl6/2013-08-07#i_7420578 Written in Perl 5 instead of Perl 6, because Perl 6 does not yet have PPI. TODO: Change . into \N iff /s is in effect. Change ^ and $ into ^^ and $$ iff /m is in effect. Move modifiers from end to beginning. Translate non-alphas into escaped forms. Convert | to || Is this needed? Translate tricks like [ ] and [|] ??? Warn about $1 becoming $0, etc ? Add tests. Integrate with the existing Blue Tiger code. Handle `use re` /flags mode? Probably not. Named capture: Translate qr{(?bar)} into rx{ $=[bar] } . See S05 "Named scalar aliasing to subpatterns" =end comments =cut #say translate_regexp( # "qr{([DEQ].{0,1}[LIM].{2,3}[LIVMF][^P]{2,3}[LMVF].[LMIV].{0,3}[DE])|([DE].{0,1}[LIM].{2,3}[LIVMF][^P]{2,3}[LMVF].[LMIV].{0,3}[DEQ])}" # "qr{foo}smx" # 'qr{["]}' # "qr{[']}" # "qr{[ ]}" # "qr{abc{2,}}" # "qr(abc{2,})" # "qr{ab[-ce]{2,}}" # "qr{ab [c-ef-z]{19,}}" # /x changes ' ' from Token::Whitespace to Token::Literal! # "qr{(?:abc)}" # ?: changes from 'Structure::Capture ( ... )' to 'Structure::Modifier (?: ... )' # 'qr{(?mi)^(?:[a-z]|\d){1,2}(?=\s)}' # Example from S05 #); # From http://elm.eu.org/elms/browse_elms.html , export classes as tsv if ($elms) { open my $fh, '<', 'elm_classes.tsv' or die; while (<$fh>) { chomp; next if /^\s*#/; next if /^"ELMIdentifier"/; # Header line my ( $id, $desc, $regex, $instances, $pdb_instances ) = split "\t"; say 'P5: ', $regex; say 'P6: ', translate_regexp($regex); say ''; } close $fh or warn; } else { while (<>) { chomp; next if /^\s*#/; # Skip commented-out lines. say translate_regexp($_); } } sub translate_regexp { die if @_ != 1; my ($regexp_in_a_string) = @_; my $re = PPIx::Regexp->new($regexp_in_a_string) or die; # Dump RE before changes are made. if ($DUMP_BEFORE) { say $re->source; my $d = PPIx::Regexp::Dumper->new($re) or die; $d->print(); } # qr// becomes re// { my $e1 = $re->first_element; if ( $e1->content eq 'qr' ) { # S05: So you may use parens as your C delimiters, but only if you interpose whitespace my $spaces = ( $e1->next_sibling->delimiters eq '()' ) ? 1 : 0; $e1->{content} = 'rx' . ( ' ' x $spaces ); } } # {2,5} becomes ** 2..5 # {2,} becomes ** 2..* # {,5} becomes ** 0..5 if ( my $sq_aref = $re->find( 'Structure::Quantifier' ) ) { for my $sq ( @{$sq_aref} ) { warn if $sq->start ->content ne '{'; warn if $sq->finish->content ne '}'; $sq->start ->{content} = ' ** '; # The trailing space is important, for {,5} not to become ***..5 $sq->finish->{content} = ' '; my $tl_aref = $sq->find('Token::Literal') or warn "Empty Structure::Quantifier!!!"; # Hmmm. 10,100 is 6 tokens, not 3! if ($tl_aref) { for my $i ( 0 .. $#{$tl_aref} ) { next if $tl_aref->[$i]->content ne ','; warn "XXX Probable bug in your regexp! FOO{,n} is not defined in Perl 5; translating to 0..n anyway\n"; $tl_aref->[$i]->{content} = ( $i == 0 ) ? '0..' : ( $i == $#{$tl_aref} ) ? '..*' : '..' ; last; } } } } # [a-z] becomes <[a..z]> # [-a-z] becomes <-[a..z]> if ( my $scc_aref = $re->find( 'Structure::CharClass' ) ) { for my $scc ( @{$scc_aref} ) { warn if $scc->start ->content ne '['; warn if $scc->finish->content ne ']'; # If this is a CharClass with only 1 char (like PBP recommends instead of escaping), # replace with single-quoted string, or double-quoted if the char is a single-quote. # XXX Are the extra spaces needed? if ( $scc->children == 1 and ref $scc->child(0) eq 'PPIx::Regexp::Token::Literal' ) { my $q = $scc->child(0)->content eq q{'} ? q{"} : q{'}; $scc->start ->{content} = ' ' . $q ; $scc->finish->{content} = $q . ' ' ; next; } $scc->start ->{content} = '<['; $scc->finish->{content} = ']>'; # Handle negated ranges if ( $scc->negated() ) { # The ^ is stored in the type(), for some odd reason. warn if $scc->type->content ne '^'; $scc->start->{content} = '<-['; $scc->type ->{content} = ''; } if ( my $nr_aref = $scc->find( 'Node::Range' ) ) { for my $nr ( @{$nr_aref} ) { my $c1 = $nr->child(1); warn if $c1->content ne '-'; warn if $c1->class ne 'PPIx::Regexp::Token::Operator'; $c1->{content} = '..'; $nr->child( 0)->{content} = ' ' . $nr->child( 0)->{content}; $nr->child(-1)->{content} = $nr->child(-1)->{content} . ' '; } } } } # (?:abc) becomes [abc] if ( my $sm_aref = $re->find( 'Structure::Modifier' ) ) { for my $sm ( @{$sm_aref} ) { # Handling the simple case for now. Will handle more complex cases later; probably with a different approach. next if $sm->type->content ne '?:'; next if %{$sm->type->modifiers}; warn if $sm->start ->content ne '('; warn if $sm->finish->content ne ')'; $sm->start ->{content} = '['; $sm->finish->{content} = ']'; $sm->type ->{content} = ''; } } # (?=\s) becomes # (?<=\s) becomes if ( my $sa_aref = $re->find( 'Structure::Assertion' ) ) { for my $sa ( @{$sa_aref} ) { warn if $sa->start ->content ne '('; warn if $sa->finish->content ne ')'; my $type = $sa->type->content; if ( $type eq '?=' ) { # Positive look-ahead $sa->start ->{content} = '<'; $sa->finish->{content} = '>'; $sa->type ->{content} = '?before '; } elsif ( $type eq '?<=' ) { # Positive look-behind $sa->start ->{content} = '<'; $sa->finish->{content} = '>'; $sa->type ->{content} = '?after '; } else { warn "Don't yet know how to handle a Structure::Assertion if type '$type':\n ", Dumper $sa; } } } if ($DUMP_AFTER) { my $d = PPIx::Regexp::Dumper->new($re) or die; $d->print(); } # Use ->content instead of ->source, in order to reflects the changes we have made to the tree. return $re->content; } __END__ Relevant snippents from S05-regex.pod : =item * The new C<:Perl5>/C<:P5> modifier allows Perl 5 regex syntax to be used instead. (It does not go so far as to allow you to put your modifiers at the end.) For instance, m:P5/(?mi)^(?:[a-z]|\d){1,2}(?=\s)/ is equivalent to the Perl 6 syntax: m/ :i ^^ [ <[a..z]> || \d ] ** 1..2 / (?mi) ^ [<[ a..z ]>|\d] ** 1..2 (?=\s)} =item * The Perl 5 C regex constructor is gone. =item * The Perl 6 equivalents are: regex { pattern } # always takes {...} as delimiters rx / pattern / # can take (almost) any chars as delimiters You may not use whitespace or alphanumerics for delimiters. Space is optional unless needed to distinguish from modifier arguments or function parens. So you may use parens as your C delimiters, but only if you interpose whitespace: rx ( pattern ) # okay rx( 1,2,3 ) # tries to call rx function (This is true for all quotelike constructs in Perl 6.) The C form may be used directly as a pattern anywhere a normal C match can. The C form is really a method definition, and must be used in such a way that the grammar class it is to be used in is apparent.