135 lines
5.3 KiB
Python
135 lines
5.3 KiB
Python
"""
|
|
Scraper JUCETINS — Junta Comercial do Estado do Tocantins
|
|
URL: https://www.to.gov.br/jucetins/leiloeiros/152aezl6blm0
|
|
Metodo: httpx + BeautifulSoup com parser regex para texto numerado plano
|
|
Estrutura: Texto numerado plano sem tabela, padrao:
|
|
N. Nome Completo
|
|
Matricula no X, de DD/MM/AAAA
|
|
Endereco: ... CEP: XXXXX-XXX, Cidade/TO
|
|
Telefone: (XX) XXXXX-XXXX
|
|
E-mail: email@exemplo.com
|
|
Registros: ~55, todos em pagina unica. Ultima atualizacao: 24/02/2026.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import List, Optional
|
|
|
|
from .base_scraper import AbstractJuntaScraper, Leiloeiro
|
|
|
|
RE_ENTRY_START = re.compile(r"^\d+\.\s+(.+)$")
|
|
RE_MATRICULA = re.compile(r"[Mm]atr[íi]cula\s+n[oº]?\s*[\.\s]*(\d+).*?de\s+(\d{2}/\d{2}/\d{4})", re.IGNORECASE)
|
|
RE_ENDERECO = re.compile(r"[Ee]ndere[çc]o:\s+(.+)", re.IGNORECASE)
|
|
RE_TELEFONE = re.compile(r"[Tt]elefone:\s+(.+)", re.IGNORECASE)
|
|
RE_EMAIL = re.compile(r"[Ee]-?mail:\s+(.+)", re.IGNORECASE)
|
|
RE_CANCELADO = re.compile(r"CANCELAMENTO\s+DE\s+MATR[ÍI]CULA", re.IGNORECASE)
|
|
|
|
|
|
class JucetinsScraper(AbstractJuntaScraper):
|
|
estado = "TO"
|
|
junta = "JUCETINS"
|
|
url = "https://www.to.gov.br/jucetins/leiloeiros/152aezl6blm0"
|
|
|
|
def _parse_text_block(self, text: str) -> List[dict]:
|
|
records = []
|
|
current: Optional[dict] = None
|
|
for raw_line in text.split("\n"):
|
|
line = (raw_line or "").strip()
|
|
if not line:
|
|
continue
|
|
m_start = RE_ENTRY_START.match(line)
|
|
if m_start:
|
|
if current and current.get("nome"):
|
|
records.append(current)
|
|
nome_raw = m_start.group(1).strip()
|
|
is_cancelado = bool(RE_CANCELADO.search(nome_raw))
|
|
nome = RE_CANCELADO.sub("", nome_raw).strip(" -")
|
|
current = {"nome": nome, "municipio": "Palmas", "situacao": "CANCELADO" if is_cancelado else None}
|
|
continue
|
|
if current is None:
|
|
continue
|
|
m = RE_MATRICULA.search(line)
|
|
if m:
|
|
current["matricula"] = m.group(1)
|
|
current.setdefault("data_registro", m.group(2))
|
|
continue
|
|
m = RE_ENDERECO.search(line)
|
|
if m:
|
|
current["endereco"] = m.group(1).strip()
|
|
m_city = re.search(r"([A-Za-záéíóúàãõçÁÉÍÓÚÀÃÕÇ\s]+)/TO", m.group(1))
|
|
if m_city:
|
|
current["municipio"] = m_city.group(1).strip()
|
|
continue
|
|
m = RE_TELEFONE.search(line)
|
|
if m:
|
|
current["telefone"] = m.group(1).strip()
|
|
continue
|
|
m = RE_EMAIL.search(line)
|
|
if m:
|
|
current["email"] = m.group(1).strip()
|
|
continue
|
|
if current and current.get("nome"):
|
|
records.append(current)
|
|
return records
|
|
|
|
async def parse_leiloeiros(self) -> List[Leiloeiro]:
|
|
soup = await self.fetch_page()
|
|
if not soup:
|
|
soup = await self.fetch_page_js(url=self.url, wait_ms=4000)
|
|
if not soup:
|
|
return []
|
|
|
|
results: List[Leiloeiro] = []
|
|
|
|
# Encontra container principal do CMS
|
|
content = soup.select_one(
|
|
".field--name-body, article .content, .node__content, main article, .conteudo, #content .field"
|
|
)
|
|
if not content:
|
|
candidates = sorted(soup.find_all(["div", "article", "section"]),
|
|
key=lambda el: len(el.get_text()), reverse=True)
|
|
content = candidates[0] if candidates else soup.body
|
|
|
|
if content:
|
|
records = self._parse_text_block(content.get_text("\n"))
|
|
if records:
|
|
for r in records:
|
|
results.append(self.make_leiloeiro(**r))
|
|
return results
|
|
|
|
# Fallback tabela
|
|
for table in soup.find_all("table"):
|
|
rows = table.find_all("tr")
|
|
if len(rows) < 2:
|
|
continue
|
|
headers = [self.clean(th.get_text()) for th in rows[0].find_all(["th", "td"])]
|
|
col = {(h or "").lower(): i for i, h in enumerate(headers)}
|
|
|
|
def gcol(cells, frags):
|
|
for k, i in col.items():
|
|
if any(f in k for f in frags) and i < len(cells):
|
|
return self.clean(cells[i].get_text())
|
|
return None
|
|
|
|
for row in rows[1:]:
|
|
cells = row.find_all(["td", "th"])
|
|
if not cells:
|
|
continue
|
|
nome = gcol(cells, ["nome", "leiloeiro"]) or self.clean(cells[0].get_text())
|
|
if not nome or len(nome) < 3:
|
|
continue
|
|
results.append(self.make_leiloeiro(
|
|
nome=nome,
|
|
matricula=gcol(cells, ["matr", "registro"]),
|
|
situacao=gcol(cells, ["situ", "status"]),
|
|
municipio=gcol(cells, ["munic", "cidade"]) or "Palmas",
|
|
telefone=gcol(cells, ["tel", "fone"]),
|
|
email=gcol(cells, ["email"]),
|
|
endereco=gcol(cells, ["ender", "logr"]),
|
|
data_registro=gcol(cells, ["data", "posse"]),
|
|
))
|
|
if results:
|
|
break
|
|
|
|
return results
|