#!/usr/bin/perl
# This line indicates that this script should be executed using Perl, and it specifies the location of the Perl interpreter (`perl`) on the system.
use strict;
use warnings;
# These lines enable strict pragma and warnings, which enforce stricter syntax rules and provide helpful diagnostic messages, respectively, helping to write cleaner and more robust code.
use Encode; # To handle encoding issues
use WWW::Mechanize;
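# These lines import the `Encode` module (character-encoding support; note that the script never calls an Encode function directly -- the EUC-KR conversion is performed by the `:encoding(...)` layer on the output file handle) and the `WWW::Mechanize` module, which provides a convenient way to interact with websites programmatically.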
# Name of the input file; it is expected to list one start URL per line.
my $filename = 'KoreaDaily_URL.txt';

# Shared browser object. autocheck is left at its default, so get() dies on an
# HTTP or network failure; fetch_content() below traps that for each request so
# that a single bad URL does not abort the whole run.
my $mech = WWW::Mechanize->new();

open(my $fh, '<', $filename) or die "Could not open file '$filename' $!";

# Number used for the next output file (1.html, 2.html, ...). It is advanced
# only after a page has actually been saved, so the numbering stays contiguous.
my $page_num = 1;

URL:
while (my $url = <$fh>) {
    # Trim surrounding whitespace rather than just chomp-ing: this also removes
    # the "\r" left behind by CRLF line endings. Blank lines are skipped
    # because get('') would fail.
    $url =~ s/\A\s+|\s+\z//g;
    next URL if $url eq '';

    my $content = fetch_content($mech, $url);
    next URL if !defined $content;
    save_page($content, $page_num++);

    # Collect the pagination links now, while the start page is still the
    # current page; the loop below navigates away from it.
    my @pagination_links = $mech->find_all_links(url_regex => qr/page=/i);

    # Pagers commonly repeat the same target (top and bottom of the page,
    # "next" as well as "2") and link back to the current page. Remember what
    # has been fetched for this start URL so each page is saved only once.
    my %seen = (link_key($mech->uri) => 1);

    LINK:
    for my $link (@pagination_links) {
        # url_abs resolves the href against the page the link was found on.
        # The raw href would instead be resolved against whatever page the
        # browser object is on by now.
        my $target = $link->url_abs;
        next LINK if $seen{ link_key($target) }++;

        my $page_content = fetch_content($mech, $target);
        next LINK if !defined $page_content;
        save_page($page_content, $page_num++);
    }
}
close($fh);

# fetch_content($agent, $target)
# Fetch $target with the WWW::Mechanize object $agent and return the page
# content as reported by $agent->content(). On failure a warning is printed and
# nothing (undef in scalar context) is returned.
sub fetch_content {
    my ($agent, $target) = @_;

    my $ok = eval { $agent->get($target); 1 };
    if (!$ok) {
        my $error = $@ || 'unknown error';
        warn "Could not fetch '$target': $error";
        return;
    }

    return $agent->content();
}

# save_page($content, $number)
# Write $content to "<number>.html" in the current directory, encoded as
# EUC-KR. Dies if the file cannot be opened, written or closed; buffered write
# errors only surface at close, hence the check there.
sub save_page {
    my ($content, $number) = @_;

    my $out_name = sprintf("%d.html", $number);
    open(my $fh_out, '>:encoding(EUC-KR)', $out_name)
        or die "Could not open file '$out_name' for writing: $!";
    print {$fh_out} $content
        or die "Could not write to file '$out_name': $!";
    close($fh_out)
        or die "Could not close file '$out_name': $!";

    return;
}

# link_key($uri)
# Return the string used to recognise already-fetched pages: the URI without
# its fragment, since "#..." does not change which document is requested.
# Works on a copy so the caller's URI object is left untouched.
sub link_key {
    my ($uri) = @_;

    my $copy = $uri->clone;
    $copy->fragment(undef);

    return $copy->as_string;
}