Size: 2158 bytes.


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// cs/ai/gpt/download_wikipedia.cc
#include <algorithm>
#include <cctype>
#include <filesystem>
#include <iostream>
#include <regex>
#include <string>
#include <system_error>
#include <vector>

#include "cs/ai/gpt/wikipedia.hh"
#include "cs/fs/fs.hh"
#include "cs/log.hh"
#include "cs/parsers/parsers.hh"
#include "cs/result.hh"
#include "cs/util/fmt.hh"

// Turns the text of an HTML <title> element into a file-name stem:
// characters are lowercased via std::tolower and each ' ', '/' and ':' is
// replaced by '_'. All other bytes are passed through tolower unchanged in
// meaning (the result depends on the current C locale).
static std::string SanitizeTitleForFilename(std::string title) {
  std::transform(
      title.begin(), title.end(), title.begin(),
      [](unsigned char c) -> char {
        if (c == ' ' || c == '/' || c == ':') {
          return '_';
        }
        // The argument must be representable as unsigned char: handing
        // std::tolower a negative plain char (e.g. a UTF-8 continuation
        // byte from a non-ASCII title) is undefined behavior.
        return static_cast<char>(std::tolower(c));
      });
  return title;
}

// Fetches articles via cs::ai::gpt::DownloadMostPopularArticles(n) and
// writes each one to "<out_dir_str>/<title>.html".
//
// The file stem is derived from the article's <title> tag (see
// SanitizeTitleForFilename); when no <title> is found the stem falls back
// to "article_<1-based index>".
//
// Returns an error Result if the output directory cannot be created, if
// the download fails, or if any write fails; otherwise an Ok Result whose
// message states how many articles were written and where.
cs::Result DownloadWikipedia(
    unsigned int n, const std::string& out_dir_str) {
  const std::filesystem::path out_dir(out_dir_str);

  // Use the non-throwing overload so a directory failure is reported
  // through the Result channel like every other failure in this function.
  std::error_code ec;
  std::filesystem::create_directories(out_dir, ec);
  if (ec) {
    return TRACE(cs::Error(
        FMT("Failed to create output directory ",
            out_dir.string(), ": " + ec.message())));
  }

  SET_OR_RET(auto articles,
             cs::ai::gpt::DownloadMostPopularArticles(n));

  // Compile the pattern once; constructing a std::regex is expensive and
  // the pattern does not depend on the article.
  const std::regex title_pattern("<title>([^<]+)</title>");

  for (size_t i = 0; i < articles.size(); ++i) {
    const std::string& html = articles[i];

    // Extract title from <title> tag, falling back to a positional name.
    std::string title = "article_" + std::to_string(i + 1);
    std::smatch match;
    if (std::regex_search(html, match, title_pattern)) {
      title = SanitizeTitleForFilename(match[1].str());
    }

    const std::filesystem::path path =
        out_dir / (title + ".html");
    LOG(INFO) << "Writing article to " << path.string()
              << " starting with: \"" << html.substr(0, 30)
              << "...\"" << ENDL;
    OK_OR_RETURN(cs::fs::write(path.string(), html));
    LOG(INFO) << "Wrote " << path.string() << ENDL;
  }

  return cs::Ok("Downloaded " +
                std::to_string(articles.size()) +
                " articles to " + out_dir.string());
}

// Program entry point. Hands argc/argv to cs::Result::Main together with a
// callback that validates the argument list and runs DownloadWikipedia.
//
// The callback requires exactly three entries: args[0] (echoed in the usage
// message), <N> (parsed as an unsigned int) and <OUT_DIR>.
int main(int argc, char** argv) {
  return cs::Result::Main(
      argc, argv,
      [](std::vector<std::string> cli) -> cs::Result {
        const bool has_expected_arity = (cli.size() == 3);
        if (!has_expected_arity) {
          return TRACE(cs::Error(
              FMT("Invalid arguments. Usage: ", cli[0],
                  " <N> <OUT_DIR>")));
        }

        // cli[1] is the article count; a parse failure is returned as-is.
        SET_OR_RET(unsigned int article_count,
                   cs::parsers::ParseUnsignedInt(cli[1]));

        // Round-trip the directory through std::filesystem::path before
        // passing it on as a string.
        const std::filesystem::path target_dir(cli[2]);
        return DownloadWikipedia(article_count,
                                 target_dir.string());
      });
}
v0 (commit) © 2025 @p13i.io | Load balancer proxied to: cs-code-viewer-2:8080 in 4ms.