about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jauhien Piatlicki (jauhien) <piatlicki@gmail.com> 2013-08-14 10:20:04 +0200
committer: Jauhien Piatlicki (jauhien) <piatlicki@gmail.com> 2013-08-14 10:20:04 +0200
commit: 8bce5c1e20223170b8569405f214cd266d0d606f (patch)
tree: 144a6422e132fd791b874e5d45b383e03581b9cd
parent: gs_pypi/pypi_db: fix parsing and store info in database (diff)
download: g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.tar.gz
g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.tar.bz2
g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.zip
gs_pypi/pypi_db: ignore errors during package page parsing
-rw-r--r-- gs_pypi/pypi_db.py 122
1 files changed, 64 insertions, 58 deletions
diff --git a/gs_pypi/pypi_db.py b/gs_pypi/pypi_db.py
index ee5c2d5..5db6c59 100644
--- a/gs_pypi/pypi_db.py
+++ b/gs_pypi/pypi_db.py
@@ -63,66 +63,72 @@ class PypiDBGenerator(DBGenerator):
data = {}
data["files"] = []
data["info"] = {}
- for table in soup("table", class_ = "list")[-1:]:
- if not "File" in table("th")[0].string:
- continue
-
- for entry in table("tr")[1:-1]:
- fields = entry("td")
-
- FILE = 0
- URL = 0
- MD5 = 1
-
- TYPE = 1
- PYVERSION = 2
- UPLOADED = 3
- SIZE = 4
-
- file_inf = fields[FILE]("a")[0]["href"].split("#")
- file_url = file_inf[URL]
- file_md5 = file_inf[MD5][4:]
-
- file_type = fields[TYPE].string
- file_pyversion = fields[PYVERSION].string
- file_uploaded = fields[UPLOADED].string
- file_size = fields[SIZE].string
-
- data["files"].append({"url": file_url,
- "md5": file_md5,
- "type": file_type,
- "pyversion": file_pyversion,
- "uploaded": file_uploaded,
- "size": file_size})
-
- uls = soup("ul", class_ = "nodot")
- if uls:
- if "Downloads (All Versions):" in uls[0]("strong")[0].string:
- ul = uls[1]
- else:
- ul = uls[0]
-
- for entry in ul.contents:
- if not hasattr(entry, "name") or entry.name != "li":
- continue
- entry_name = entry("strong")[0].string
- if not entry_name:
+ try:
+ for table in soup("table", class_ = "list")[-1:]:
+ if not "File" in table("th")[0].string:
continue
- if entry_name == "Categories":
- data["info"][entry_name] = {}
- for cat_entry in entry("a"):
- cat_data = cat_entry.string.split(" :: ")
- data["info"][entry_name][cat_data[0]] = cat_data[1:]
- continue
-
- if entry("span"):
- data["info"][entry_name] = entry("span")[0].string
- continue
-
- if entry("a"):
- data["info"][entry_name] = entry("a")[0]["href"]
- continue
+ for entry in table("tr")[1:-1]:
+ fields = entry("td")
+
+ FILE = 0
+ URL = 0
+ MD5 = 1
+
+ TYPE = 1
+ PYVERSION = 2
+ UPLOADED = 3
+ SIZE = 4
+
+ file_inf = fields[FILE]("a")[0]["href"].split("#")
+ file_url = file_inf[URL]
+ file_md5 = file_inf[MD5][4:]
+
+ file_type = fields[TYPE].string
+ file_pyversion = fields[PYVERSION].string
+ file_uploaded = fields[UPLOADED].string
+ file_size = fields[SIZE].string
+
+ data["files"].append({"url": file_url,
+ "md5": file_md5,
+ "type": file_type,
+ "pyversion": file_pyversion,
+ "uploaded": file_uploaded,
+ "size": file_size})
+
+ uls = soup("ul", class_ = "nodot")
+ if uls:
+ if "Downloads (All Versions):" in uls[0]("strong")[0].string:
+ ul = uls[1]
+ else:
+ ul = uls[0]
+
+ for entry in ul.contents:
+ if not hasattr(entry, "name") or entry.name != "li":
+ continue
+ entry_name = entry("strong")[0].string
+ if not entry_name:
+ continue
+
+ if entry_name == "Categories":
+ data["info"][entry_name] = {}
+ for cat_entry in entry("a"):
+ cat_data = cat_entry.string.split(" :: ")
+ data["info"][entry_name][cat_data[0]] = cat_data[1:]
+ continue
+
+ if entry("span"):
+ data["info"][entry_name] = entry("span")[0].string
+ continue
+
+ if entry("a"):
+ data["info"][entry_name] = entry("a")[0]["href"]
+ continue
+
+ except Exception as error:
+ print("There was an error during parsing: " + str(error))
+ print("Ignoring this package.")
+ data = {}
return data