diff options
author | Jauhien Piatlicki (jauhien) <piatlicki@gmail.com> | 2013-08-14 10:20:04 +0200 |
---|---|---|
committer | Jauhien Piatlicki (jauhien) <piatlicki@gmail.com> | 2013-08-14 10:20:04 +0200 |
commit | 8bce5c1e20223170b8569405f214cd266d0d606f (patch) | |
tree | 144a6422e132fd791b874e5d45b383e03581b9cd | |
parent | gs_pypi/pypi_db: fix parsing and store info in database (diff) | |
download | g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.tar.gz g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.tar.bz2 g-sorcery-8bce5c1e20223170b8569405f214cd266d0d606f.zip |
gs_pypi/pypi_db: ignore errors during package page parsing
-rw-r--r-- | gs_pypi/pypi_db.py | 122 |
1 file changed, 64 insertions, 58 deletions
diff --git a/gs_pypi/pypi_db.py b/gs_pypi/pypi_db.py index ee5c2d5..5db6c59 100644 --- a/gs_pypi/pypi_db.py +++ b/gs_pypi/pypi_db.py @@ -63,66 +63,72 @@ class PypiDBGenerator(DBGenerator): data = {} data["files"] = [] data["info"] = {} - for table in soup("table", class_ = "list")[-1:]: - if not "File" in table("th")[0].string: - continue - - for entry in table("tr")[1:-1]: - fields = entry("td") - - FILE = 0 - URL = 0 - MD5 = 1 - - TYPE = 1 - PYVERSION = 2 - UPLOADED = 3 - SIZE = 4 - - file_inf = fields[FILE]("a")[0]["href"].split("#") - file_url = file_inf[URL] - file_md5 = file_inf[MD5][4:] - - file_type = fields[TYPE].string - file_pyversion = fields[PYVERSION].string - file_uploaded = fields[UPLOADED].string - file_size = fields[SIZE].string - - data["files"].append({"url": file_url, - "md5": file_md5, - "type": file_type, - "pyversion": file_pyversion, - "uploaded": file_uploaded, - "size": file_size}) - - uls = soup("ul", class_ = "nodot") - if uls: - if "Downloads (All Versions):" in uls[0]("strong")[0].string: - ul = uls[1] - else: - ul = uls[0] - - for entry in ul.contents: - if not hasattr(entry, "name") or entry.name != "li": - continue - entry_name = entry("strong")[0].string - if not entry_name: + try: + for table in soup("table", class_ = "list")[-1:]: + if not "File" in table("th")[0].string: continue - if entry_name == "Categories": - data["info"][entry_name] = {} - for cat_entry in entry("a"): - cat_data = cat_entry.string.split(" :: ") - data["info"][entry_name][cat_data[0]] = cat_data[1:] - continue - - if entry("span"): - data["info"][entry_name] = entry("span")[0].string - continue - - if entry("a"): - data["info"][entry_name] = entry("a")[0]["href"] - continue + for entry in table("tr")[1:-1]: + fields = entry("td") + + FILE = 0 + URL = 0 + MD5 = 1 + + TYPE = 1 + PYVERSION = 2 + UPLOADED = 3 + SIZE = 4 + + file_inf = fields[FILE]("a")[0]["href"].split("#") + file_url = file_inf[URL] + file_md5 = file_inf[MD5][4:] + + file_type = 
fields[TYPE].string + file_pyversion = fields[PYVERSION].string + file_uploaded = fields[UPLOADED].string + file_size = fields[SIZE].string + + data["files"].append({"url": file_url, + "md5": file_md5, + "type": file_type, + "pyversion": file_pyversion, + "uploaded": file_uploaded, + "size": file_size}) + + uls = soup("ul", class_ = "nodot") + if uls: + if "Downloads (All Versions):" in uls[0]("strong")[0].string: + ul = uls[1] + else: + ul = uls[0] + + for entry in ul.contents: + if not hasattr(entry, "name") or entry.name != "li": + continue + entry_name = entry("strong")[0].string + if not entry_name: + continue + + if entry_name == "Categories": + data["info"][entry_name] = {} + for cat_entry in entry("a"): + cat_data = cat_entry.string.split(" :: ") + data["info"][entry_name][cat_data[0]] = cat_data[1:] + continue + + if entry("span"): + data["info"][entry_name] = entry("span")[0].string + continue + + if entry("a"): + data["info"][entry_name] = entry("a")[0]["href"] + continue + + except Exception as error: + print("There was an error during parsing: " + str(error)) + print("Ignoring this package.") + data = {} return data |