goblaettle/main.go

114 lines
2.9 KiB
Go

package main
import (
"context"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
)
// baseURI is the site root that both the index page path and every scraped
// link are appended to. It is joined by plain string concatenation, so it
// must keep its trailing slash.
const baseURI = "https://www.rottenacker.de/"
// main runs the crawl and exits with a non-zero status if it fails.
//
// All work lives in crawl so that its deferred cleanups run before the
// process exits: log.Fatal terminates via os.Exit, which skips any defers
// registered in the calling function.
func main() {
	if err := crawl(); err != nil {
		log.Fatal(err)
	}
}

// crawl renders the index page in headless Chromium, collects the href of
// every anchor on it and passes each one to downloadFile. Files are stored
// in the current working directory as "mitteilungsblatt_<last path segment>".
//
// It returns an error when Chromium cannot be found, the page cannot be
// loaded, or the resulting HTML cannot be parsed. A failed download of an
// individual link is logged and does not abort the remaining downloads.
func crawl() error {
	// Look up the Chromium binary on PATH (the original target is a
	// Raspberry Pi, where it is installed as "chromium-browser").
	execPath, err := exec.LookPath("chromium-browser")
	if err != nil {
		return fmt.Errorf("Chromium binary not found: %w", err)
	}

	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("headless", true),
		chromedp.Flag("disable-gpu", true),
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("ignore-certificate-errors", true),
		chromedp.ExecPath(execPath),
	)

	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancelAlloc()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	// Load the index page, wait until its body is visible, then capture the
	// rendered markup.
	var responseText string
	err = chromedp.Run(ctx,
		chromedp.Navigate(baseURI+"cgi-seiten/amtsblatt.htm"),
		chromedp.WaitVisible("body", chromedp.ByQuery),
		chromedp.Evaluate(`document.documentElement.innerHTML`, &responseText),
	)
	if err != nil {
		return fmt.Errorf("loading index page: %w", err)
	}

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(responseText))
	if err != nil {
		return fmt.Errorf("parsing index page: %w", err)
	}

	doc.Find("a").Each(func(_ int, s *goquery.Selection) {
		link, exists := s.Attr("href")
		if !exists {
			return
		}

		// The last path segment of the href becomes the local file name.
		name := link[strings.LastIndex(link, "/")+1:]
		if name == "" {
			// An empty href or one ending in "/" has no file name; every such
			// link would otherwise collide on the file "mitteilungsblatt_".
			return
		}

		// Links are written relative to the index page; dropping the "../"
		// segments makes them relative to the site root.
		fileURL := baseURI + strings.ReplaceAll(link, "../", "")
		if err := downloadFile(fileURL, "mitteilungsblatt_"+name); err != nil {
			log.Printf("downloading %s: %v", fileURL, err)
		}
	})
	return nil
}
// downloadFile fetches url with an HTTP GET and stores the response body at
// filepath.
//
// If fileExists reports that filepath is already present, the download is
// skipped and nil is returned. A response status other than 200 OK is
// reported as an error and nothing is written.
//
// The body is first written to "<filepath>.part" and renamed to filepath only
// after it has been copied and closed without error. An interrupted transfer
// therefore never leaves a truncated file under the final name, which the
// existence check would otherwise treat as a finished download on every
// later run.
func downloadFile(url string, filepath string) error {
	if fileExists(filepath) {
		fmt.Printf("File %s already exists. Skipping download.\n", filepath)
		return nil
	}

	response, err := http.Get(url)
	if err != nil {
		return fmt.Errorf("requesting %s: %w", url, err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("failed to download file, status code: %d", response.StatusCode)
	}

	partPath := filepath + ".part"
	file, err := os.Create(partPath)
	if err != nil {
		return fmt.Errorf("creating %s: %w", partPath, err)
	}

	if _, err := io.Copy(file, response.Body); err != nil {
		// Best-effort cleanup: the copy error is the one worth reporting, so
		// failures of Close and Remove are deliberately ignored here.
		_ = file.Close()
		_ = os.Remove(partPath)
		return fmt.Errorf("writing %s: %w", partPath, err)
	}

	// Close explicitly rather than via defer so its error is checked before
	// the file is moved into place.
	if err := file.Close(); err != nil {
		_ = os.Remove(partPath) // best-effort cleanup
		return fmt.Errorf("closing %s: %w", partPath, err)
	}

	if err := os.Rename(partPath, filepath); err != nil {
		_ = os.Remove(partPath) // best-effort cleanup
		return fmt.Errorf("moving %s to %s: %w", partPath, filepath, err)
	}

	fmt.Println("File downloaded successfully!")
	return nil
}
// fileExists reports whether filepath is present on disk. It returns false
// only when os.Stat fails with a "does not exist" error; a successful Stat or
// any other Stat error yields true.
func fileExists(filepath string) bool {
	if _, statErr := os.Stat(filepath); os.IsNotExist(statErr) {
		return false
	}
	return true
}