常用漢字2136をスクレイピングその２

常用漢字をスクレイピングして CSV にする記事の、その２です。Python を使います。
ターミナルで次のように実行してください。

xxxx@xxxxxMacBook-Pro py_test % python3 japanese_kanji.py


import requests
from bs4 import BeautifulSoup
import csv

# Scrape the joyo kanji list on Japanese Wikipedia and write each kanji with
# its total stroke count to a CSV file.

# Wikipedia URL of the joyo kanji list page (percent-encoded page title)
url = "https://ja.wikipedia.org/wiki/%E5%B8%B8%E7%94%A8%E6%BC%A2%E5%AD%97%E4%B8%80%E8%A6%A7"

# Fetch the webpage. requests applies no timeout by default, so without an
# explicit one a stalled connection would hang the script indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Stop here on any 4xx/5xx response

# Parse the page content
soup = BeautifulSoup(response.content, "html.parser")

# Collected rows, each of the form [kanji, stroke_count]
kanji_data = []

# Every table carrying the "wikitable" class is scanned
tables = soup.find_all("table", class_="wikitable")

for table in tables:
    rows = table.find_all("tr")
    for row in rows[1:]:  # Skip the header row
        columns = row.find_all("td")
        if len(columns) < 5:
            # Not enough <td> cells to hold both fields; ignore this row
            continue
        kanji = columns[1].text.strip()  # Second <td> cell: the kanji
        stroke_count = columns[4].text.strip()  # Fifth <td> cell: total strokes
        if kanji and stroke_count:  # Keep only rows where both fields are non-empty
            kanji_data.append([kanji, stroke_count])

# Save the data to a CSV file
output_file = "jo_kanji.csv"

# newline="" lets the csv module control line endings itself
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["漢字", "総画"])  # Header
    writer.writerows(kanji_data)

print(f"Data saved to {output_file}")

漢字,総画

亜,7

哀,9

挨,10

愛,13

曖,17

悪,11

握,12

圧,5

扱,6

宛,8

嵐,12

安,6

案,10

暗,13

以,5

衣,6

位,7

囲,7

医,7

依,8

……のような感じで CSV ができます。

import requests
from bs4 import BeautifulSoup
import csv

# Scrape the joyo kanji list on Japanese Wikipedia and write each kanji with
# its total stroke count and readings to a CSV file.

# Wikipedia URL of the joyo kanji list page (percent-encoded page title)
url = "https://ja.wikipedia.org/wiki/%E5%B8%B8%E7%94%A8%E6%BC%A2%E5%AD%97%E4%B8%80%E8%A6%A7"

# Fetch the webpage. requests applies no timeout by default, so without an
# explicit one a stalled connection would hang the script indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Stop here on any 4xx/5xx response

# Parse the page content
soup = BeautifulSoup(response.content, "html.parser")

# Collected rows, each of the form [kanji, stroke_count, readings]
kanji_data = []

# Every table carrying the "wikitable" class is scanned
tables = soup.find_all("table", class_="wikitable")

for table in tables:
    rows = table.find_all("tr")
    for row in rows[1:]:  # Skip the header row
        columns = row.find_all("td")
        if len(columns) < 9:
            # Not enough <td> cells to hold all three fields; ignore this row
            continue
        kanji = columns[1].text.strip()  # Second <td> cell: the kanji
        stroke_count = columns[4].text.strip()  # Fifth <td> cell: total strokes
        readings = columns[8].text.strip()  # Ninth <td> cell: on/kun readings
        if kanji and stroke_count and readings:  # Keep only fully populated rows
            kanji_data.append([kanji, stroke_count, readings])

# Save the data to a CSV file
output_file = "jo_kanji_with_readings.csv"

# newline="" lets the csv module control line endings itself
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["漢字", "総画", "音訓"])  # Header
    writer.writerows(kanji_data)

print(f"Data saved to {output_file}")

上のコードは、音訓（読み）も一緒に出力するバージョンです。
以上です

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル