KoreaDaily Yellowpage Mechanize

#!/usr/bin/perl
# This shebang line tells the operating system to run the script with the Perl interpreter located at `/usr/bin/perl`. It only takes effect when it is the very first line of the script file.

use strict;
use warnings;
# These lines enable strict pragma and warnings, which enforce stricter syntax rules and provide helpful diagnostic messages, respectively, helping to write cleaner and more robust code.

use Encode;  # To handle encoding issues
use WWW::Mechanize;
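# These lines load the `Encode` module, Perl's character-encoding library, and the `WWW::Mechanize` module, which provides a convenient way to interact with websites programmatically. The script never calls an `Encode` function directly; the conversion to EUC-KR is performed by the `:encoding(EUC-KR)` layer on the output file handles.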

# Name of the text file that lists the start URLs, one URL per line.
my $filename = 'KoreaDaily_URL.txt';

# User agent shared by every request. With WWW::Mechanize's default
# `autocheck` setting a failed request makes get() die, so a single
# unreachable URL stops the whole run.
my $mech = WWW::Mechanize->new();

# Open the URL list for reading (`'<'`); abort if it cannot be opened.
open(my $fh, '<', $filename) or die "Could not open file '$filename' $!";

# Running counter used to name the output files: 1.html, 2.html, ...
my $page_num = 1;

while (my $url = <$fh>) {
    # Strip the line ending and surrounding whitespace. This also removes a
    # trailing "\r" left behind when the list was saved with CRLF line endings.
    chomp $url;
    $url =~ s/\A\s+|\s+\z//g;

    # Blank lines carry no URL; skip them instead of requesting an empty URL.
    next if $url eq '';

    # Fetch and save the start page itself.
    $mech->get($url);
    save_page($mech->content(), $page_num++);

    # Collect the pagination links (any link whose href contains "page=").
    # Each link is resolved to an absolute URL *now*, against the page it was
    # found on, because fetching the first pagination page changes the
    # agent's current base URL. Duplicates (e.g. a pager rendered at both the
    # top and the bottom of the page) and links that point back to the page
    # just saved are dropped so that no page is downloaded twice.
    my %seen = ($mech->uri->as_string => 1);
    my @pagination_urls =
        grep { !$seen{$_}++ }
        map  { $_->url_abs->as_string }
        $mech->find_all_links(url_regex => qr/page=/i);

    # Fetch and save every distinct pagination page.
    for my $page_url (@pagination_urls) {
        $mech->get($page_url);
        save_page($mech->content(), $page_num++);
    }
}

# Done with the URL list.
close($fh);

# save_page($content, $number)
#
# Writes $content to "<$number>.html" in the current directory, encoding the
# text as EUC-KR on output. Dies if the file cannot be opened or if closing
# it fails; buffered write errors only surface at close time.
sub save_page {
    my ($content, $number) = @_;

    my $out_file = sprintf("%d.html", $number);

    open(my $fh_out, '>:encoding(EUC-KR)', $out_file)
        or die "Could not open file '$out_file' for writing: $!";
    print {$fh_out} $content;
    close($fh_out)
        or die "Could not close file '$out_file' after writing: $!";

    return;
}

Leave a Reply

Your email address will not be published. Required fields are marked *