2019-03-20 08:53:37 -05:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
##########################################################################
|
|
|
|
#
|
|
|
|
# pgAdmin 4 - PostgreSQL Tools
|
|
|
|
#
|
2024-01-01 02:43:48 -06:00
|
|
|
# Copyright (C) 2013 - 2024, The pgAdmin Development Team
|
2019-03-20 08:53:37 -05:00
|
|
|
# This software is released under the PostgreSQL Licence
|
|
|
|
#
|
|
|
|
##########################################################################
|
|
|
|
|
|
|
|
# This utility will extract SQL keywords from postgres website and
|
|
|
|
# pgsql keywords from the code git paths mentioned in PG_CODES_URLS
|
|
|
|
# Note that, PG_CODES_URLS may need to be changed manually per version change
|
|
|
|
|
|
|
|
import re
|
|
|
|
import requests
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
PG_CODES_URLS = [
|
2022-09-21 04:56:03 -05:00
|
|
|
"https://raw.githubusercontent.com/postgres/postgres/master/src/pl/"
|
|
|
|
"plpgsql/src/pl_scanner.c",
|
2019-03-20 08:53:37 -05:00
|
|
|
]
|
2023-11-02 05:33:50 -05:00
|
|
|
PG_CODES_REGEX = r"PG_KEYWORD\(\"([a-z]*)\"[A-Z_, ]*\)"
|
2019-03-20 08:53:37 -05:00
|
|
|
|
|
|
|
PG_SQL_DOCS_URL = \
|
|
|
|
"https://www.postgresql.org/docs/current/sql-keywords-appendix.html"
|
|
|
|
PG_SQL_DOCS_REGEX = "<[a-z =\"]*>([A-Z_]*)"
|
|
|
|
|
|
|
|
PG_CURRENT_VERSION_URL = "https://www.postgresql.org/docs/current/index.html"
|
|
|
|
PG_CURRENT_VERSION_REGEX = "PostgreSQL ([0-9.]+) Documentation"
|
|
|
|
|
|
|
|
|
|
|
|
def apply_regex(text, regex):
|
|
|
|
return re.findall(regex, text)
|
|
|
|
|
|
|
|
|
|
|
|
def get_file_from_url(url):
|
|
|
|
req = requests.get(url)
|
|
|
|
return req.text
|
|
|
|
|
|
|
|
|
|
|
|
def extract_keywords(text, regex):
|
|
|
|
keywords = apply_regex(text, regex)
|
|
|
|
return [k.lower() for k in keywords]
|
|
|
|
|
|
|
|
|
|
|
|
def get_release_tag(current_url=PG_CURRENT_VERSION_URL,
|
|
|
|
version_regex=PG_CURRENT_VERSION_REGEX):
|
|
|
|
resp_text = get_file_from_url(current_url)
|
|
|
|
version = apply_regex(resp_text, version_regex)
|
2020-08-31 06:15:31 -05:00
|
|
|
if isinstance(version, list):
|
2019-03-20 08:53:37 -05:00
|
|
|
version = version[0]
|
|
|
|
|
|
|
|
return "REL_" + version.replace(".", "_")
|
|
|
|
|
|
|
|
|
|
|
|
def get_keywords_pg_code(file_urls=PG_CODES_URLS,
|
|
|
|
keyword_regex=PG_CODES_REGEX):
|
|
|
|
keywords = []
|
|
|
|
|
|
|
|
# Lets get the latest version first
|
|
|
|
rel_tag = get_release_tag()
|
|
|
|
for file_url in file_urls:
|
|
|
|
if "hb" not in file_url:
|
|
|
|
file_url = file_url + ";hb=" + rel_tag
|
|
|
|
resp_text = get_file_from_url(file_url)
|
|
|
|
|
|
|
|
# Sample entry - PG_KEYWORD("begin", K_BEGIN, RESERVED_KEYWORD)
|
|
|
|
keywords.extend(extract_keywords(resp_text, keyword_regex))
|
|
|
|
|
|
|
|
return keywords
|
|
|
|
|
|
|
|
|
|
|
|
def get_keywords_pg_docs(docs_url=PG_SQL_DOCS_URL,
|
|
|
|
keyword_regex=PG_SQL_DOCS_REGEX):
|
|
|
|
resp_text = get_file_from_url(docs_url)
|
|
|
|
# Sample entry - <code class="token">ABORT</code>
|
|
|
|
keywords = extract_keywords(resp_text, keyword_regex)
|
|
|
|
|
|
|
|
return keywords
|
|
|
|
|
|
|
|
|
|
|
|
def get_all_keywords():
|
|
|
|
final_keywords = set()
|
|
|
|
|
|
|
|
final_keywords.update(get_keywords_pg_code())
|
|
|
|
final_keywords.update(get_keywords_pg_docs())
|
|
|
|
|
|
|
|
return len(final_keywords), " ".join(sorted(list(final_keywords))).strip()
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
args_parser = argparse.ArgumentParser(description="SQL Keywords extractor")
|
|
|
|
args_parser.add_argument(
|
|
|
|
'--total',
|
|
|
|
help="Print with total number of keywords",
|
|
|
|
action="store_true"
|
|
|
|
)
|
|
|
|
args = args_parser.parse_args()
|
|
|
|
|
|
|
|
total, keywords = get_all_keywords()
|
|
|
|
if args.total:
|
|
|
|
print(keywords + "%s\n\n%d keywords extracted." % (keywords, total))
|
|
|
|
else:
|
|
|
|
print(keywords)
|