#!/usr/bin/env python3
"""Extract CNES hospital table from DATASUS."""
import csv
import sys
import requests
from bs4 import BeautifulSoup
URL = "https://cnes2.datasus.gov.br/Mod_Ind_Unidade_Listar.asp?VTipo=05&VListar=1&VEstado=31&VMun=&VSubUni=&VComp=201412"
OUTPUT = "cnes_hospitais.csv"
def fetch(url: str) -> BeautifulSoup:
    """Fetch *url* and return the parsed HTML document.

    DATASUS serves its pages as ISO-8859-1 (Latin-1); the encoding is
    forced so accented Portuguese text decodes correctly.

    Args:
        url: Full CNES listing URL to download.

    Returns:
        A BeautifulSoup tree built with the stdlib ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
    }
    r = requests.get(url, headers=headers, timeout=30)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    r.raise_for_status()
    r.encoding = "iso-8859-1"
    return BeautifulSoup(r.text, "html.parser")
def extract_table(soup: BeautifulSoup) -> tuple[list[str], list[list[str]]]:
    """Extract the hospital listing table from the parsed CNES page.

    Args:
        soup: Parsed page as returned by :func:`fetch`.

    Returns:
        ``(headers, data)`` — the header-cell texts and one list of cell
        texts per non-empty data row.

    Raises:
        ValueError: if the bordered data table (or any row in it) is missing.
    """
    # Target the bordered data table (not layout tables)
    table = soup.find("table", {"border": "1"})
    if not table:
        raise ValueError("Tabela não encontrada na página.")

    rows = table.find_all("tr")
    if not rows:
        raise ValueError("Nenhuma linha encontrada na tabela.")

    # Header cells may be marked up as <th> or plain <td> depending on the
    # page version — accept both so headers never come back empty.
    headers = [cell.get_text(strip=True) for cell in rows[0].find_all(["th", "td"])]
    data = []
    for row in rows[1:]:
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        if cells:  # skip spacer rows with no cells
            data.append(cells)

    return headers, data
def save_csv(headers: list[str], data: list[list[str]], path: str) -> None:
    """Write *headers* followed by every row of *data* to *path* as UTF-8 CSV."""
    with open(path, "w", newline="", encoding="utf-8") as out:
        csv.writer(out).writerows([headers, *data])
def main() -> None:
    """Download the CNES listing page, extract its table, and save it as CSV."""
    print(f"Buscando: {URL}")
    page = fetch(URL)

    cols, table_rows = extract_table(page)
    print(f"Colunas: {cols}")
    print(f"Linhas: {len(table_rows)}")

    save_csv(cols, table_rows, OUTPUT)
    print(f"Salvo em: {OUTPUT}")
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:  # top-level boundary: report and exit non-zero
        print(f"Erro: {exc}", file=sys.stderr)
        sys.exit(1)