常用漢字をスクレイピング その2。Wikipediaの「常用漢字一覧」から漢字と総画数を取得してCSVに出力します。言語はPythonです。
実行するときは、ターミナルで
xxxx@xxxxxMacBook-Pro py_test % python3 japanese_kanji.py
のように実行してください。
import requests
from bs4 import BeautifulSoup
import csv
# Wikipedia URL for 常用漢字一覧 (the list of jōyō kanji)
url = "https://ja.wikipedia.org/wiki/%E5%B8%B8%E7%94%A8%E6%BC%A2%E5%AD%97%E4%B8%80%E8%A6%A7"

# Name of the CSV file the script produces
output_file = "jo_kanji.csv"

# Wikimedia's User-Agent policy asks clients to identify themselves; generic
# library user agents may be rejected. Add contact info here if you reuse this.
REQUEST_HEADERS = {"User-Agent": "jo-kanji-scraper/1.0 (python-requests)"}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def fetch_page(page_url, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return it parsed as a BeautifulSoup document.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    response = requests.get(page_url, headers=REQUEST_HEADERS, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def extract_kanji_rows(soup):
    """Return a list of ``[kanji, stroke_count]`` pairs found in *soup*.

    Every ``<table class="wikitable">`` is scanned. Rows with fewer than five
    ``<td>`` cells, or with an empty kanji or stroke-count cell, are skipped.
    """
    kanji_data = []
    for table in soup.find_all("table", class_="wikitable"):
        # The first <tr> of each table is the header row.
        for row in table.find_all("tr")[1:]:
            columns = row.find_all("td")
            if len(columns) < 5:
                continue
            kanji = columns[1].text.strip()  # Second column: 常用漢字
            stroke_count = columns[4].text.strip()  # Fifth column: 総画
            if kanji and stroke_count:
                kanji_data.append([kanji, stroke_count])
    return kanji_data


def write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as UTF-8 encoded CSV."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def main():
    """Scrape the jōyō kanji list and save kanji + stroke counts to CSV."""
    soup = fetch_page(url)
    kanji_data = extract_kanji_rows(soup)
    write_csv(output_file, ["漢字", "総画"], kanji_data)
    print(f"Data saved to {output_file}")


if __name__ == "__main__":
    main()
漢字,総画
亜,7
哀,9
挨,10
愛,13
曖,17
悪,11
握,12
圧,5
扱,6
宛,8
嵐,12
安,6
案,10
暗,13
以,5
衣,6
位,7
囲,7
医,7
依,8
……のような感じでCSVができます。
import requests
from bs4 import BeautifulSoup
import csv
# Wikipedia URL for 常用漢字一覧 (the list of jōyō kanji)
url = "https://ja.wikipedia.org/wiki/%E5%B8%B8%E7%94%A8%E6%BC%A2%E5%AD%97%E4%B8%80%E8%A6%A7"

# Name of the CSV file the script produces
output_file = "jo_kanji_with_readings.csv"

# Wikimedia's User-Agent policy asks clients to identify themselves; generic
# library user agents may be rejected. Add contact info here if you reuse this.
REQUEST_HEADERS = {"User-Agent": "jo-kanji-scraper/1.0 (python-requests)"}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def fetch_page(page_url, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return it parsed as a BeautifulSoup document.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    response = requests.get(page_url, headers=REQUEST_HEADERS, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def extract_kanji_rows(soup):
    """Return a list of ``[kanji, stroke_count, readings]`` rows from *soup*.

    Every ``<table class="wikitable">`` is scanned. Rows with fewer than nine
    ``<td>`` cells, or with any of the three picked cells empty, are skipped.
    """
    kanji_data = []
    for table in soup.find_all("table", class_="wikitable"):
        # The first <tr> of each table is the header row.
        for row in table.find_all("tr")[1:]:
            columns = row.find_all("td")
            if len(columns) < 9:
                continue
            kanji = columns[1].text.strip()  # Second column: 常用漢字
            stroke_count = columns[4].text.strip()  # Fifth column: 総画
            readings = columns[8].text.strip()  # Ninth column: 音訓
            if kanji and stroke_count and readings:
                kanji_data.append([kanji, stroke_count, readings])
    return kanji_data


def write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as UTF-8 encoded CSV."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def main():
    """Scrape the jōyō kanji list and save kanji, strokes and readings to CSV."""
    soup = fetch_page(url)
    kanji_data = extract_kanji_rows(soup)
    write_csv(output_file, ["漢字", "総画", "音訓"], kanji_data)
    print(f"Data saved to {output_file}")


if __name__ == "__main__":
    main()
上のコードは、音訓(読み)も一緒にCSVへ出力する版です。
以上です