#!/usr/bin/perl -w
# web2words by Thyrst'
#
# Turns a text/HTML file or a web page into a sorted list of unique,
# lower-case, diacritic-free words that are longer than three characters.
#
# Usage: web2words FILE | URL
#
# FILE  The file is read and then REPLACED by the word list
#       (one word per line).
# URL   The page is downloaded into the current directory under a name
#       derived from the URL (every non-word character becomes "_"),
#       and that file then receives the word list.
#
# Input files are assumed to be UTF-8 encoded.

use Text::Undiacritic qw(undiacritic);
use LWP::Simple;
use strict;
use warnings;

my $file = shift;
die "No file\n" unless defined $file && length $file;

if ($file =~ m{^https?://(.*)}s) {
    # Derive a file-system friendly name from the URL without its scheme.
    my $pagename = $1;
    $pagename =~ s/\W/_/g;

    my $page = get($file);
    die "Web page not found\n" unless defined $page && length $page;

    open(my $new, '>:encoding(UTF-8)', $pagename) or die "Cannot make file\n";
    print {$new} $page;
    close($new) or die "Cannot make file\n";

    $file = $pagename;
}

my $text = makeline($file);
my @pass = hoover($text);

# The word list takes the place of the input file.
unlink($file);
creator($file, @pass);

exit;

###########################

# makeline($path)
#
# Reads $path and joins all lines that are at least three characters long
# into one "_"-separated string, then strips diacritics and lower-cases it.
#
# Returns the joined string (it always starts with "__").
# Dies with "Cannot open file" if $path cannot be read and with "No words"
# if no line is long enough.
sub makeline {
    my ($path) = @_;

    open(my $in, '<:encoding(UTF-8)', $path) or die "Cannot open file\n";

    my @lines;
    while (my $line = <$in>) {
        chomp $line;
        next if length($line) < 3;
        push @lines, $line;
    }
    close($in);

    die "No words\n" unless @lines;

    # The leading "_" element reproduces the historical "__" prefix.
    my $joined = join('_', '_', @lines);

    return lc(undiacritic($joined));
}

# hoover($text)
#
# Strips <style> blocks and all remaining tags from $text, splits what is
# left on non-word characters and underscores, and keeps only words that
# are longer than three characters.
#
# Returns the sorted list of unique words.
# Dies with "No words" if nothing is left.
sub hoover {
    my ($text) = @_;

    # NOTE(review): the script had one more substitution at this point whose
    # search pattern is empty in the source as received (s//_$1_/g).  With an
    # empty pattern Perl reuses the last successfully matched regex (or
    # matches the empty string), so it cannot have worked as written.  It has
    # been dropped; restore it here if the intended pattern can be recovered.

    # Drop whole style sheets so CSS does not end up in the word list.
    $text =~ s{<style\b[^>]*>.*?</style\s*>}{_}gs;

    # Every remaining tag only acts as a word separator.
    $text =~ s/<.*?>/_/gs;

    # Splitting on runs of separators and filtering by length avoids the
    # empty entries and the surviving short words that chained
    # "_word_" substitutions produce when neighbours share a delimiter.
    my %seen;
    my @words = grep { length($_) > 3 && !$seen{$_}++ } split /[\W_]+/, $text;

    die "No words\n" unless @words;

    return sort @words;
}

# creator($path, @words)
#
# Writes @words to $path, one word per line (no trailing newline),
# overwriting any existing file.  Dies with the system error on failure.
sub creator {
    my ($path, @words) = @_;

    open(my $out, '>:encoding(UTF-8)', $path) or die "error $! \n";
    print {$out} join("\n", @words) or die "error $! \n";
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "error $! \n";

    return;
}