diff --git a/LICENSE b/LICENSE index a2b995d..a4831ac 100644 --- a/LICENSE +++ b/LICENSE @@ -568,4 +568,26 @@ Apache License distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. + + MIT License + +Copyright (c) 2023 Arik Reuter + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/.readthedocs.yml b/LLMTopicDetection_TopicGPT/.readthedocs.yml new file mode 100644 index 0000000..a50d762 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/.readthedocs.yml @@ -0,0 +1,9 @@ +version: 2 + +sphinx: + configuration: docs/source/conf.py + +python: + version: 3.8 + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/MANIFEST.in b/LLMTopicDetection_TopicGPT/MANIFEST.in new file mode 100644 index 0000000..187361e --- /dev/null +++ b/LLMTopicDetection_TopicGPT/MANIFEST.in @@ -0,0 +1,4 @@ +# MANIFEST.in + +include README.md +recursive-include quicktests * \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/README.md b/LLMTopicDetection_TopicGPT/README.md new file mode 100644 index 0000000..ec4869f --- /dev/null +++ b/LLMTopicDetection_TopicGPT/README.md @@ -0,0 +1,375 @@ +# TopicGPT +TopicGPT integrates the remarkable capabilities of current LLMs such as GPT-3.5 and GPT-4 into topic modelling. + +While traditional topic models extract topics as simple lists of top-words, such as ["Lion", "Leopard", "Rhino", "Elephant", "Buffalo"], TopicGPT offers rich and dynamic topic representations that can be intuitively understood, extensively investigated and modified in various ways via simple text commands in natural language. + +More specifically, it provides the following core functionalities: +- Identification of clusters within document-embeddings and top-word extraction +- Generation of informative topic descriptions +- Extraction of detailed information about topics via Retrieval-Augmented-Generation (RAG) +- Comparison of topics +- Splitting and combining of identified topics +- Addition of new topics based on keywords +- Deletion of topics + +When directly interacting with TopicGPT via prompting and without explicitly calling functions, an LLM autonomously decides which functionality to use. 
+ +## Paper + +To read more about the model, check out the corresponding [paper](https://arxiv.org/abs/2403.03628): https://arxiv.org/abs/2403.03628 + +## Installation + +You can install TopicGPT via [PyPI](https://pypi.org/project/topicgpt/) + +``` +pip install topicgpt +``` + +## Further Documentation + +You can find detailed documentation of the available classes and functions [here](https://lmu-seminar-llms.github.io/TopicGPT/). + + +## Example + +The following short example demonstrates how TopicGPT could be used on a real-world dataset. The Twenty Newsgroups corpus (https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is used for this purpose. + +Further example-notebooks can be found under examples/ in the repository. + +### Load the data + +```python +from sklearn.datasets import fetch_20newsgroups + +data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset +corpus = data['data'] + +corpus = [doc for doc in corpus if doc != ""] #remove empty documents +``` +### Initialize the model + +Note that an OpenAI API-Key is needed to compute the embeddings and execute the prompts. See https://platform.openai.com/account/api-keys for more details. We select 20 topics in this case since the Twenty Newsgroups corpus comprises documents from 20 different newsgroups. It is also possible to let Hdbscan determine the number of topics automatically. + +```python +from topicgpt.TopicGPT import TopicGPT + +tm = TopicGPT( + api_key = "YOUR_OPENAI_API_KEY", + n_topics = 20 # select 20 topics since the true number of topics is 20 +) + +# Or, to use with Azure +tm = TopicGPT( + api_key = "YOUR_AZURE_OPENAI_API_KEY", + azure_endpoint = { + "endpoint": "YOUR_AZURE_OPENAI_ENDPOINT_URL", + "api_version": "YOUR_API_VERSION" + }, + n_topics = 20 +) +``` + +### Fit the model + +The fit-method fits the model. This can take, depending on the size of the dataset and whether embeddings have been provided, from a few minutes to several hours. Especially the computation of the embeddings can take some time. 
+ +```python +tm.fit(corpus) # the corpus argument should be of type list[str] where each string represents one document +``` + +### Inspect the found topics + +Obtain an overview of the identified topics +```python +print(tm.topic_lis) +``` +_Output_ +``` +[Topic 0: Electronics Equipment Sales, + Topic 1: Image Processing, + Topic 2: Gun control, + Topic 3: Online Privacy and Anonymity, + Topic 4: Conflict and Violence., + Topic 5: Computer Hardware, + Topic 6: Belief and Atheism, + Topic 7: Online Discussions, + Topic 8: Computer Software, + Topic 9: Car Features and Performance, + Topic 10: Encryption and Government, + Topic 11: Technology and Computing., + Topic 12: Technology and Computing, + Topic 13: Space Exploration, + Topic 14: Motorcycle Riding Techniques, + Topic 15: Technology, + Topic 16: Hockey Games, + Topic 17: Health and Medicine., + Topic 18: Baseball games and teams., + Topic 19: Beliefs about Homosexuality.] +``` +To obtain more detailed information on each topic, we can call the "print_topics" method: + +```python +tm.print_topics() +``` +_Output_ +``` +Topic 0: Electronics Equipment Sales + +Topic_description: The common topic of the given words appears to be "electronics and technology". + +Various aspects and sub-topics of this topic include: +1. Buying and selling: "offer", "sale", "sell", "price", "buy" +2. Device usage and features: "use", "get", "new", "used", "condition" +3. Technical specifications: "wire", "ground", "power", "circuit", "voltage" +4. Communication and connectivity: "phone", "email", "modem", "wireless", "connection" +5. Accessories and peripherals: "battery", "cable", "manuals", "disk", "monitor" +Top words: ["n't", 'one', 'would', 'use', 'like', 'get', 'new', 'used', 'offer', 'sale'] + +[...] 
+``` +We can also visualize the resulting clusters to get an overview of the shape and size of the clusters +``` +tm.visualize_clusters() +``` + +### Find out more detailed information about the identified topics + +First, we might be interested in knowing what information the space topic (topic 13) contains on the moon landing. + +```python +tm.pprompt("Which information related to the keyword 'moon landing' does topic 13 have?") +``` + +_Output_ +``` +GPT wants to the call the function: { + "name": "knn_search", + "arguments": "{\n \"topic_index\": 13,\n \"query\": \"moon landing\",\n \"k\": 5\n}" +} +Topic 13, which is related to the keyword "moon landing," has the following information: + +1. Document index 258: This document provides an introduction to the solar system and mentions that advancements in rocketry after World War II enabled machines to travel to the Moon and other planets. It highlights that the United States has sent both automated spacecraft and human-crewed expeditions to explore the Moon. + +2. Document index 535: This document discusses a $65 million program called the Back to the Moon bill, which aims to encourage private companies to develop lunar orbiters. It mentions that there is a chance of making a lunar mission happen in this decade through this program. + +3. Document index 357: This document is a request for more information on a recent newspaper article about the Japanese crashing or crash-landing a package on the Moon. It indicates that the article was vague and unclear. + +4. Document index 321: This document speculates about what would have happened if the Soviets had beaten the United States in the Moon race. It suggests that the US would have still performed Moon landings and potentially set up a lunar base. The focus on Mars exploration would have depended on the Soviets' actions. + +5. 
Document index 102: This document mentions the Hiten engineering-test mission, which spent time in a highly eccentric Earth orbit and performed lunar flybys before being inserted into lunar orbit using gravity-assist-like maneuvers. It states that the mission was expected to crash on the Moon eventually. + +Please note that the above summaries are based on the content of the documents and may not capture all the information contained within them. +``` + +From this output we see that an instance of a GPT decided to call the function "knn_search" from the class "TopicPrompting". Indeed, some documents on the topic "moon landing" have been found and the model summarizes the relevant information accordingly. + +If we want to check, for instance, the document with index 102 in topic 13 to learn more about the Hiten engineering-test mission, we can simply do the following: + +```python +print(tm.topic_lis[13].documents[102]) +``` +_Output_ +``` +Their Hiten engineering-test mission spent a while in a highly eccentric Earth orbit doing lunar flybys, and then was inserted into lunar orbit using some very tricky gravity-assist-like maneuvering. This meant that it would crash on the Moon eventually, since there is no such thing as a stable lunar orbit (as far as anyone knows), and I believe I recall hearing recently that it was about to happen. +``` + +### Split Topics +We find that topic 6 about Belief and Atheism is a bit general and would thus like to see what 5 potential subtopics it contains: + +```python +tm.pprompt("What are 5 potential subtopics of topic 6") +``` + +_Output_ + +``` +GPT wants to the call the function: { + "name": "split_topic_kmeans", + "arguments": "{\n \"topic_idx\": 6,\n \"n_clusters\": 5\n}" +} + +Here are five potential subtopics of topic 6: + +1. Existence of God: This subtopic explores arguments, evidence, proofs, and the existence of God. + +2. 
Atheism vs Theism: This subtopic delves into the concepts of atheism, theism, atheists, and theists, and the debates surrounding these belief systems. + +3. Belief and Faith: This subtopic focuses on beliefs, faith, believers, and the roles they play in religious and philosophical contexts. + +4. Rationality and Reasoning: This subtopic involves discussions on reason, rationality, logic, and argumentation as they relate to beliefs and debates about religion. + +5. Morality and Ethics: This subtopic explores moral values, ethics, morals, and how they influence and are influenced by religious beliefs and debates. + +These subtopics were inferred from the words provided and the descriptions of the topics in the corpus used for topic modeling. + +``` +Based on those results we decide to not actually split the topic into 5 subtopics but rather just 3 - One on "religious faith", one on "atheism" and one on "ethics and philosophy". +We also want to actually make those changes to the existing topics, so we tell TopicGPT to do the changes inplace. + +```python +tm.pprompt("Please split topic 6 into subtopics based on the keywords 'religious faith', 'atheism' and 'ethics and philosophy'. Do this inplace.") +``` +_Output_ + +``` +I have split topic 6 into three subtopics based on the provided keywords: "religious faith", "atheism", and "ethics and philosophy". Here are the new subtopics: + +1. Topic 21: Religious Beliefs + - This topic encompasses the general theme of "Religion and Belief". + - Subtopics and aspects of this topic include: + - Faith and Beliefs: Believing, belief, true, evidence, truth. + - Religious Practices: Praying, heretic, bible, church. + - Religious Communities: Christian, Muslim, atheist. + - Interpretation and Understanding: Interpretations, interpretation, understanding, context. + - Morality and Ethics: Good, wrong, punishment, violence. + +2. Topic 22: Existence of God + - This topic focuses on "Atheism and Belief". 
+ - Subtopics and aspects of this topic include: + - Arguments and Evidence: Atheists provide evidence for their beliefs. + - Existence of God: Atheists question the existence of God. + - Religious Beliefs: Atheists challenge religious beliefs. + - Science and Reason: Atheists rely on science and reason. + - Morality and Ethics: Atheists discuss morality and ethics. + +3. Topic 23: Ethics and Morality + - This topic revolves around "Morality and Ethics". + - Subtopics and aspects of this topic include: + - Moral reasoning: "Argument," "reasoning," "logic," "fallacy," "premises." + - Ethical principles: "Right," "wrong," "immoral," "morals," "ethical." + - Moral judgments: "Moral," "ethical," "morality," "judgments," "values." + - Human behavior: "Homosexual," "sexual," "promiscuous," "behavior," "sex." + - Scientific perspective: "Science," "scientific," "evidence," "empirical," "studies." +Please note that these subtopics are based on the keywords provided and may not cover all aspects of the original topic. +``` + +We see that TopicGPT performed the splitting as intended. However, the names and descriptions of the topics got changed slightly in order to optimally fit to the documents that actually constitute the topics based on the keywords. (If you like you can rename them at any time). + +### Combining topics + +Since topics 15 ("Hockey Games") and 17 ("Baseball games and teams") are both about sports, we want to combine them into a single topic. + +```python +tm.pprompt("Please combine topics 15 and 17. Do this inplace.") +``` +_Output_ + +``` +GPT wants to the call the function: { + "name": "combine_topics", + "arguments": "{\n \"topic_idx_lis\": [15, 17],\n \"inplace\": true\n}" +} +The topics 15 and 17 have been combined into a new topic called "Sports". This topic includes aspects and sub-topics related to sports such as team and players, games and seasons, performance and skills, fans and audience, and statistics and records. 
Some of the common words found in this topic include "team," "players," "hockey," "baseball," "game," "games," "season," "playoffs," "good," "better," "win," "hit," "score," "fans," "series," "watch," "fan," "stats," "record," "pts," and "career". +``` + +### Saving and Reusing Embeddings + +After generating embeddings with `tm.fit(corpus)`, save them with `tm.save_embeddings()`. By default, they are stored in `SavedEmbeddings/embeddings.pkl`. Enable reuse by setting `use_saved_embeddings=True` in `TopicGPT` initialization. + +```python +tm.fit(corpus) +tm.save_embeddings() # Default path + +# Reuse saved embeddings +tm2 = TopicGPT(use_saved_embeddings=True) + +# For a custom path: +tm.save_embeddings(path='your/custom/path.pkl') +tm3 = TopicGPT(use_saved_embeddings=True, path_saved_embeddings='your/custom/path.pkl') +``` + +This approach saves time by avoiding re-calculation of embeddings for large datasets. + + +## Limitations and Caveats + +It is important to note that, as a model built on top of inherently stochastic LLMs and all their shortcomings, TopicGPT has several limitations and shortcomings as well. LLMs are Machine Learning models and as such, they are not perfect at solving the intended tasks; They may be useful because they are correct reasonably often, but they can always fail. The following list is not complete, but may provide useful information on what may go wrong when using TopicGPT: + +- **Hallucination**: LLMs are well known for yielding incorrect but coherent and plausible answers that seem convincing but are actually just made up. Although we tried to minimize this undesired behavior through carefully designing the used prompts, we found that TopicGPT may hallucinate (especially) with respect to the following aspects: + - Making up, distorting or misinterpreting content of documents retrieved via knn-search. + - Incorrectly naming and describing topics based on top-words. 
Specifically, the model can identify topics that seem coherent and reasonable although the corresponding documents are not actually related. + +- **Undesired Behaviour**: When using the "prompt" or "pprompt" function, TopicGPT may not call the function you intended it to call. This can be alleviated by explicitly telling the model which function to use or directly calling the function yourself. It sometimes also tries to call invalid functions or functions with invalid arguments. + +- **Stochasticity**: The behavior of TopicGPT is not deterministic and exhibits some randomness. There is always some probability that certain actions do not work as intended at the first try because some components of the LLM do not function as desired. Simply trying again should mostly help with those issues. + - On the other hand, TopicGPT may also be overly cautious and report that no relevant information has been found or no topic exists that matches a certain keyword even though it does. This could be caused by designing prompts to prevent massive occurrence of falsely positive results. + Note that using GPT-4 in TopicGPT can help to significantly alleviate issues with hallucination. + +- **Erroneous embeddings**: The document- and word-embeddings used in TopicGPT may not always reflect the actual semantics of the texts correctly. More specifically, the embeddings sometimes reflect, for instance, grammatical or orthographical aspects such that clusters based on those aspects may be identified. + +- **Size of the dataset**: TopicGPT might fail when the dataset is too small (less than 1000 documents). This is because then the identified topics might become very small and noisy. The RAG aspect will also likely not work as intended. Datasets of more than 10,000 documents are recommended. Note that the processing of very large datasets might not fit into the main memory of your computer. 
+ + +## Tips and tricks for prompting TopicGPT +When using the "pprompt" or "prompt" function, TopicGPT can behave differently than intended. To alleviate those issues some simple tricks can help: + +- Explicitly tell the model which function it should use and which parameters to select. (Sometimes the model simply cannot know what you expect it to do.) For example, instead of using ```tm.pprompt("What are the subtopic of topic 13?")```, use something like ```tm.pprompt("What are the subtopic of topic 13? Please use the function that uses the k-means algorithm to split the topic. Use a parameter of k = 5 and do this inplace")``` + +- Just ask the same prompt again. Since TopicGPT is a stochastic system, calling the same function with the same argument again might yield a different functionality to be used or a different result. + +- If this doesn't help, you can also directly call the function you want to use from the TopicPrompting class. In the example above you could do ```tm.topic_prompting.split_topic_kmeans(topic_idx = 13, n_clusters = 5, inplace = True)```. Note that all functions the model can call can also be called directly. + +- In case of hallucination of facts it may help to use GPT-4 for TopicGPT + +## How TopicGPT works + +TopicGPT is centrally built on top of text embeddings and the prompting mechanisms obtained via LLMs and provided by the OpenAI API. Please also see the section [References](#references) for more details on the models and ideas used in TopicGPT. + +### Embeddings +When no embeddings are provided, TopicGPT automatically computes the embeddings of the documents of the provided corpus and also of the vocabulary that is extracted from the corpus. This happens after the fit-method is called. + +The class ```GetEmbeddingsOpenAI``` is used for this purpose. 
+ +### Clustering +In order to identify topics among the documents, TopicGPT reduces the dimensionality of the document embeddings via UMAP and then uses Hdbscan to identify the clusters. Dimensionality reduction is necessary since the document embeddings are of very high dimensionality and thus the curse of dimensionality would make it very difficult, if not impossible, to identify the clusters. + +When not specifying the number of topics in the ```TopicGPT``` class, Hdbscan is used to automatically determine the number of topics. If the number of topics is specified, agglomerative clustering is used on top of the clusters identified by HDBSCAN. + +The class ```Clustering``` is used for this purpose. + +### Extraction of Top-Words + +After the clusters have been identified, TopicGPT extracts the top-words of each topic. This is done via two different methods: +- **Tf-idf**: The tf-idf method is based on the idea that words that occur frequently in a topic but rarely in other topics are good indicators for the topic. The top-words are thus the words with the highest tf-idf scores. +- **Centroid similarity**: The centroid similarity method is based on the idea that the words that are closest to the centroid of a topic are good indicators for the topic. The top-words are thus the words that are closest to the centroid of the topic. + +Note that the Tf-idf heuristic was introduced for the BERTopic Model (Grootendorst, Maarten. "BERTopic: Neural topic modeling with a class-based TF-IDF procedure." arXiv preprint arXiv:2203.05794 (2022)) and a similar idea to the centroid similarity method is used in Top2Vec (Angelov, Dimo. "Top2vec: Distributed representations of topics." arXiv preprint arXiv:2008.09470 (2020)). + +Topword extraction is performed with help of the class ```ExtractTopWords```. + +### Describing and naming topics + +In the next step, all topics are provided with a short name and a description. 
This is done via prompting an LLM provided by OpenAI with around 500 top-words of each topic. The LLM then generates a short name and a description for each topic. + +The class ```TopwordEnhancement``` is used for this purpose. + + +Note that computation of Embeddings, Extraction of Top-Words and Describing and Naming Topics are all performed when calling the ```fit``` method of the ```TopicGPT``` class. + +### Prompting of TopicGPT + +When formalizing a prompt via the ```pprompt``` or ```prompt``` function, TopicGPT uses the following steps: + +1. The prompt, together with basic model- and corpus-information, is sent to an LLM provided by OpenAI. The LLM then decides which function of the ```TopicPrompting``` class to call. The LLM also decides which arguments to use for the function. +2. The function is called according to the information by the LLM. The full result of the function will be returned to the user. +3. Parts of the results of the function are returned to the LLM. The LLM then generates a short answer of the original prompt with help of the function result and returns it to the user. + + +## References + +The following models, software packages and ideas are central for TopicGPT: + +- **UMAP**: The Uniform Manifold Approximation and Projection for Dimension Reduction algorithm is used for reducing the dimensionality of document- and word embeddings (McInnes, Leland, John Healy, and James Melville. "Umap: Uniform manifold approximation and projection for dimension reduction." arXiv preprint arXiv:1802.03426 (2018).) + +- **HDBSCAN**: Hierarchical density based clustering is used to identify the clusters among the dimensionality reduced topics (McInnes, Leland, John Healy, and Steve Astels. "hdbscan: Hierarchical density based clustering." J. Open Source Softw. 2.11 (2017): 205.) 
+ +- **Agglomerative Clustering**: The agglomerative clustering functionality from sklearn is used to combine topics in case the identified number of clusters exceeds the number of topics specified by the user (Pedregosa, Fabian, et al. "Scikit-learn: Machine learning in Python." the Journal of machine Learning research 12 (2011): 2825-2830., https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) + +- **Topword extraction**: Even though the corresponding packages are not directly used, the topword extraction methods used for this package are based on very similar ideas as found in the BERTopic Model (Grootendorst, Maarten. "BERTopic: Neural topic modeling with a class-based TF-IDF procedure." arXiv preprint arXiv:2203.05794 (2022)) in the case of the tf-idf method and in Top2Vec for the centroid-similarity method (Angelov, Dimo. "Top2vec: Distributed representations of topics." arXiv preprint arXiv:2008.09470 (2020)). + +- **LLMs from the GPT family**: Some references for the models for computing embeddings and answering the prompts include: + - Brown, Tom B., et al. “Language Models are Few-Shot Learners.” Advances in Neural Information Processing Systems 33 (2020). + - Radford, Alec, et al. “GPT-4: Generative Pre-training of Transformers with Discrete Latent Variables.” arXiv preprint arXiv:2302.07413 (2023). + - Radford, Alec, et al. “Improving Language Understanding by Generative Pre-Training.” URL: https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf. [6] + - Radford, Alec, et al. “Language Models are Unsupervised Multitask Learners.” OpenAI Blog 1.8 (2019): 9. 
[7] diff --git a/LLMTopicDetection_TopicGPT/docs/Makefile b/LLMTopicDetection_TopicGPT/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/LLMTopicDetection_TopicGPT/docs/README.rst b/LLMTopicDetection_TopicGPT/docs/README.rst new file mode 100644 index 0000000..7c97f14 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/README.rst @@ -0,0 +1,34 @@ +============== +TopicGPT +============== + +TopicGPT integrates the remarkable capabilities of current LLMs such as GPT-3.5 and GPT-4 into topic modeling. + +While traditional topic models extract topics as simple lists of top-words, such as ["Lion", "Leopard", "Rhino", "Elephant", "Buffalo"], TopicGPT offers rich and dynamic topic representations that can be intuitively understood, extensively investigated and modified in various ways via simple text commands. 
+ +More specifically, it provides the following core functionalities: + +- Identification of clusters within document-embeddings and top-word extraction +- Generation of informative topic descriptions +- Extraction of detailed information about topics via Retrieval-Augmented-Generation (RAG) +- Comparison of topics +- Splitting and combining of identified topics +- Addition of new topics based on keywords +- Deletion of topics + +It is further possible to directly interact with TopicGPT via prompting and without explicitly calling functions - an LLM autonomously decides which functionality to use. + +Installation Guide +------------------ + +To install TopicGPT, simply use PyPI: + +.. code-block:: bash + + pip install topicgpt + +GitHub Repository +----------------- + +For more details, usage examples, source code, and testing procedures, please visit the TopicGPT GitHub repository: https://github.com/LMU-Seminar-LLMs/TopicGPT + diff --git a/LLMTopicDetection_TopicGPT/docs/README_long.rst b/LLMTopicDetection_TopicGPT/docs/README_long.rst new file mode 100644 index 0000000..57b33ab --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/README_long.rst @@ -0,0 +1,433 @@ +TopicGPT +======== + +TopicGPT integrates the remarkable capabilities of current LLMs such as GPT-3.5 and GPT-4 into topic modeling. + +While traditional topic models extract topics as simple lists of top-words, such as ["Lion", "Leopard", "Rhino", "Elephant", "Buffalo"], TopicGPT offers rich and dynamic topic representations that can be intuitively understood, extensively investigated and modified in various ways via simple text commands. 
+ +More specifically, it provides the following core functionalities: + +- Identification of clusters within document-embeddings and top-word extraction +- Generation of informative topic descriptions +- Extraction of detailed information about topics via Retrieval-Augmented-Generation (RAG) +- Comparison of topics +- Splitting and combining of identified topics +- Addition of new topics based on keywords +- Deletion of topics + +It is further possible to directly interact with TopicGPT via prompting and without explicitly calling functions - an LLM autonomously decides which functionality to use. + + +GitHub Repository +----------------- + +You can find the source code and related materials for this project in the GitHub repository: + +- `TopicGPT <https://github.com/LMU-Seminar-LLMs/TopicGPT/tree/dev>`_ + + + + + +Installation +------------ + +You can install topicgpt via `PyPI <https://pypi.org/project/topicgpt/>`_ + +:: + + pip install topicgpt + + +Example +======= + +The following example demonstrates how TopicGPT can be used on a real-world dataset. The Twenty Newsgroups corpus (`Twenty Newsgroups Corpus Documentation <https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html>`_) is used for this purpose. + +Load the data +------------- + +.. code-block:: python + + from sklearn.datasets import fetch_20newsgroups + + data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset + corpus = data['data'] + + corpus = [doc for doc in corpus if doc != ""] #remove empty documents + +Initialize the model +-------------------- + +Note that an OpenAI API-Key is needed to compute the embeddings and execute the prompts. See `OpenAI API Keys Documentation <https://platform.openai.com/account/api-keys>`_ for more details. We select 20 topics in this case since the Twenty Newsgroups corpus comprises documents from 20 different newsgroups. It is also possible to let Hdbscan determine the number of topics automatically. + +.. 
code-block:: python + + from topicgpt.TopicGPT import TopicGPT + + tm = TopicGPT( + openai_api_key = "YOUR_OPENAI_API_KEY", + n_topics = 20 # select 20 topics since the true number of topics is 20 + ) + + +Fit the model +------------- + +The fit-method fits the model. This can take, depending on the size of the dataset and whether embeddings have been provided, from a few minutes to several hours. Especially the computation of the embeddings can take some time. + +.. code-block:: python + + tm.fit(corpus) # the corpus argument should be of type list[str] where each string represents one document + +Inspect the found topics +------------------------ + +Obtain an overview of the identified topics. + +.. code-block:: python + + print(tm.topic_lis) + + Output: + + .. code-block:: plaintext + + [Topic 0: Electronics Equipment Sales, + Topic 1: Image Processing, + Topic 2: Gun control, + Topic 3: Online Privacy and Anonymity, + Topic 4: Conflict and Violence., + Topic 5: Computer Hardware, + Topic 6: Belief and Atheism, + Topic 7: Online Discussions, + Topic 8: Computer Software, + Topic 9: Car Features and Performance, + Topic 10: Encryption and Government, + Topic 11: Technology and Computing., + Topic 12: Technology and Computing, + Topic 13: Space Exploration, + Topic 14: Motorcycle Riding Techniques, + Topic 15: Technology, + Topic 16: Hockey Games, + Topic 17: Health and Medicine., + Topic 18: Baseball games and teams., + Topic 19: Beliefs about Homosexuality.] +To obtain more detailed information on each topic, we can call the "print_topics" method: + +.. code-block:: python + + tm.print_topics() + + Output: + + .. code-block:: plaintext + + Topic 0: Electronics Equipment Sales + + Topic_description: The common topic of the given words appears to be "electronics and technology". + + Various aspects and sub-topics of this topic include: + 1. Buying and selling: "offer", "sale", "sell", "price", "buy" + 2. Device usage and features: "use", "get", "new", "used", "condition" + 3. 
Technical specifications: "wire", "ground", "power", "circuit", "voltage" + 4. Communication and connectivity: "phone", "email", "modem", "wireless", "connection" + 5. Accessories and peripherals: "battery", "cable", "manuals", "disk", "monitor" + Top words: ["n't", 'one', 'would', 'use', 'like', 'get', 'new', 'used', 'offer', 'sale'] + + [...] + +We can also visualize the resulting clusters to get an overview of the shape and size of the clusters. + +.. code-block:: python + + tm.visualize_clusters() + +Find out more detailed information about the identified topics +-------------------------------------------------------------- + +First, we might be interested in knowing what information the space topic (topic 13) contains on the moon landing. + +.. code-block:: python + + tm.pprompt("Which information related to the keyword 'moon landing' does topic 13 have?") + + Output: + + .. code-block:: plaintext + + GPT wants to call the function: { + "name": "knn_search", + "arguments": "{\n \"topic_index\": 13,\n \"query\": \"moon landing\",\n \"k\": 5\n}" + } + Topic 13, which is related to the keyword "moon landing," has the following information: + + 1. Document index 258: This document provides an introduction to the solar system and mentions that advancements in rocketry after World War II enabled machines to travel to the Moon and other planets. It highlights that the United States has sent both automated spacecraft and human-crewed expeditions to explore the Moon. + + 2. Document index 535: This document discusses a $65 million program called the Back to the Moon bill, which aims to encourage private companies to develop lunar orbiters. It mentions that there is a chance of making a lunar mission happen in this decade through this program. + + 3. Document index 357: This document is a request for more information on a recent newspaper article about the Japanese crashing or crash-landing a package on the Moon. It indicates that the article was vague and unclear.
+ + 4. Document index 321: This document speculates about what would have happened if the Soviets had beaten the United States in the Moon race. It suggests that the US would have still performed Moon landings and potentially set up a lunar base. The focus on Mars exploration would have depended on the Soviets' actions. + + 5. Document index 102: This document mentions the Hiten engineering-test mission, which spent time in a highly eccentric Earth orbit and performed lunar flybys before being inserted into lunar orbit using gravity-assist-like maneuvers. It states that the mission was expected to crash on the Moon eventually. + + Please note that the above summaries are based on the content of the documents and may not capture all the information contained within them. + +From this output, we see that an instance of a GPT decided to call the function "knn_search" from the class "TopicPrompting." Indeed, some documents on the topic "moon landing" have been found, and the model summarizes the relevant information accordingly. + +If we want to check, for instance, the document with index 102 in topic 13 to learn more about the Hiten engineering-test mission, we can simply do the following: + +.. code-block:: python + + print(tm.topic_lis[13].documents[102]) + + Output: + + .. code-block:: plaintext + + Their Hiten engineering-test mission spent a while in a highly eccentric Earth orbit doing lunar flybys, and then was inserted into lunar orbit using some very tricky gravity-assist-like maneuvering. This meant that it would crash on the Moon eventually, since there is no such thing as a stable lunar orbit (as far as anyone knows), and I believe I recall hearing recently that it was about to happen. + + +Split Topics +------------ + +We find that topic 6 about Belief and Atheism is a bit general and would thus like to split it into subtopics. Let's see what 5 potential subtopics it contains: + +..
code-block:: python + + tm.pprompt("What are 5 potential subtopics of topic 6") + + Output: + + .. code-block:: plaintext + + GPT wants to call the function: { + "name": "split_topic_kmeans", + "arguments": "{\n \"topic_idx\": 6,\n \"n_clusters\": 5\n}" + } + + Here are five potential subtopics of topic 6: + + 1. Existence of God: This subtopic explores arguments, evidence, proofs, and the existence of God. + + 2. Atheism vs Theism: This subtopic delves into the concepts of atheism, theism, atheists, and theists, and the debates surrounding these belief systems. + + 3. Belief and Faith: This subtopic focuses on beliefs, faith, believers, and the roles they play in religious and philosophical contexts. + + 4. Rationality and Reasoning: This subtopic involves discussions on reason, rationality, logic, and argumentation as they relate to beliefs and debates about religion. + + 5. Morality and Ethics: This subtopic explores moral values, ethics, morals, and how they influence and are influenced by religious beliefs and debates. + + These subtopics were inferred from the words provided and the descriptions of the topics in the corpus used for topic modeling. + +Based on those results, we decide not to split the topic into 5 subtopics but rather into just 3: one on "religious faith," one on "atheism," and one on "ethics and philosophy." We also want to apply those changes to the existing topics, so we tell TopicGPT to make the changes in place. + +.. code-block:: python + + tm.pprompt("Please split topic 6 into subtopics based on the keywords 'religious faith', 'atheism' and 'ethics and philosophy'. Do this inplace.") + + +Output: + + .. code-block:: plaintext + + + I have split topic 6 into three subtopics based on the provided keywords: "religious faith", "atheism", and "ethics and philosophy". Here are the new subtopics: + + 1. Topic 21: Religious Beliefs + - This topic encompasses the general theme of "Religion and Belief".
+ - Subtopics and aspects of this topic include: + - Faith and Beliefs: Believing, belief, true, evidence, truth. + - Religious Practices: Praying, heretic, bible, church. + - Religious Communities: Christian, Muslim, atheist. + - Interpretation and Understanding: Interpretations, interpretation, understanding, context. + - Morality and Ethics: Good, wrong, punishment, violence. + + 2. Topic 22: Existence of God + - This topic focuses on "Atheism and Belief". + - Subtopics and aspects of this topic include: + - Arguments and Evidence: Atheists provide evidence for their beliefs. + - Existence of God: Atheists question the existence of God. + - Religious Beliefs: Atheists challenge religious beliefs. + - Science and Reason: Atheists rely on science and reason. + - Morality and Ethics: Atheists discuss morality and ethics. + + 3. Topic 23: Ethics and Morality + - This topic revolves around "Morality and Ethics". + - Subtopics and aspects of this topic include: + - Moral reasoning: "Argument," "reasoning," "logic," "fallacy," "premises." + - Ethical principles: "Right," "wrong," "immoral," "morals," "ethical." + - Moral judgments: "Moral," "ethical," "morality," "judgments," "values." + - Human behavior: "Homosexual," "sexual," "promiscuous," "behavior," "sex." + - Scientific perspective: "Science," "scientific," "evidence," "empirical," "studies." + Please note that these subtopics are based on the keywords provided and may not cover all aspects of the original topic. + + +We see that TopicGPT performed the splitting as intended. However, the names and descriptions of the topics got changed slightly in order to optimally fit to the documents that actually constitute the topics based on the keywords. (If you like, you can rename them at any time.) + +Combining topics +---------------- + +Since topics 15 ("Hockey Games") and 17 ("Baseball games and teams") are both about sports, we want to combine them into a single topic. + +..
code-block:: python + + tm.pprompt("Please combine topics 15 and 17. Do this inplace.") + +Output: + + +GPT wants to call the function: + +.. code-block:: json + + { + "name": "combine_topics", + "arguments": "{\n \"topic_idx_lis\": [15, 17],\n \"inplace\": true\n}" + } + +The topics 15 and 17 have been combined into a new topic called "Sports". This topic includes aspects and sub-topics related to sports such as team and players, games and seasons, performance and skills, fans and audience, and statistics and records. Some of the common words found in this topic include "team," "players," "hockey," "baseball," "game," "games," "season," "playoffs," "good," "better," "win," "hit," "score," "fans," "series," "watch," "fan," "stats," "record," "pts," and "career". + +Tips and tricks for prompting TopicGPT +--------------------------------------- + +When using the "pprompt" or "prompt" function, TopicGPT can behave differently than intended. To alleviate those issues some simple tricks can help: + +- Explicitly tell the model which function it should use and which parameters to select. (Sometimes the model simply cannot know what you expect it to do.) For example, instead of using ``tm.pprompt("What are the subtopics of topic 13?")``, use something like ``tm.pprompt("What are the subtopics of topic 13? Please use the function that uses the k-means algorithm to split the topic. Use a parameter of k = 5 and do this inplace")``. + +- Just ask the same prompt again. Since TopicGPT is a stochastic system, issuing the same prompt again might cause a different function to be called or yield a different result. + +- If this doesn't help, you can also directly call the function you want to use from the TopicPrompting class. In the example above you could do ``tm.topic_prompting.split_topic_kmeans(topic_idx=13, n_clusters=5, inplace=True)``. Note that all functions the model can call can also be called directly.
+ +- In case of hallucination of facts, it may help to use GPT-4 for TopicGPT. + + + +How TopicGPT works +================== + +TopicGPT is centrally built on top of text embeddings and the prompting mechanisms obtained via LLMs and provided by the OpenAI API. Please also see the section `References`_ for more details on the models and ideas used in TopicGPT. + +Embeddings +---------- + +When no embeddings are provided, TopicGPT automatically computes the embeddings of the documents of the provided corpus and also of the vocabulary that is extracted from the corpus. This happens after the fit-method is called. + +The class ``GetEmbeddingsOpenAI`` is used for this purpose. + +Clustering +---------- + +In order to identify topics among the documents, TopicGPT reduces the dimensionality of the document embeddings via UMAP and then uses HDBSCAN to identify the clusters. Dimensionality reduction is necessary since the document embeddings are of very high dimensionality, and thus the curse of dimensionality would make it very difficult, if not impossible, to identify the clusters. + +When not specifying the number of topics in the ``TopicGPT`` class, HDBSCAN is used to automatically determine the number of topics. If the number of topics is specified, agglomerative clustering is used on top of the clusters identified by HDBSCAN. + +The class ``Clustering`` is used for this purpose. + +Extraction of Top-Words +------------------------ + +After the clusters have been identified, TopicGPT extracts the top-words of each topic. This is done via two different methods: + +- **Tf-idf**: The tf-idf method is based on the idea that words that occur frequently in a topic but rarely in other topics are good indicators for the topic. The top-words are thus the words with the highest tf-idf scores. + +- **Centroid similarity**: The centroid similarity method is based on the idea that the words that are closest to the centroid of a topic are good indicators for the topic.
The top-words are thus the words that are closest to the centroid of the topic. + +Note that the Tf-idf heuristic was introduced for the BerTopic Model (Grootendorst, Maarten. "BERTopic: Neural topic modeling with a class-based TF-IDF procedure." arXiv preprint arXiv:2203.05794 (2022)) and a similar idea to the centroid similarity method is used in Top2Vec (Angelov, Dimo. "Top2vec: Distributed representations of topics." arXiv preprint arXiv:2008.09470 (2020)). + +Topword extraction is performed with help of the class ``ExtractTopWords``. + +Describing and naming topics +------------------------------ + +In the next step, all topics are provided with a short name and a description. This is done via prompting an LLM provided by OpenAI with around 500 top-words of each topic. The LLM then generates a short name and a description for each topic. + +The class ``TopwordEnhancement`` is used for this purpose. + +Note that computation of Embeddings, Extraction of Top-Words, and Describing and Naming Topics are all performed when calling the ``fit`` method of the ``TopicGPT`` class. + +#### Describing and naming topics + +In the next step, all topics are provided with a short name and a description. This is done via prompting an LLM provided by OpenAI with around 500 top-words of each topic. The LLM then generates a short name and a description for each topic. + +The class ```TopwordEnhancement``` is used for this purpose. + + +Note that computation of Embeddings, Extraction of Top-Words and Describing and Naming Topics are all performed when calling the ```fit``` method of the ```TopicGPT``` class. + +Prompting +--------- + +The main way to interact with TopicGPT is via direct textual prompts. Those prompts are augmented with basic information about desired behavior and potentially useful information. Additionally, information on available functions and their parameters is provided. Then this information is used to prompt an LLM via the OpenAI API. 
The LLM then decides if it should call a function of the ones provided and if so, which parameters to use. The respective function call is executed, and part of the result is returned to the LLM, which uses the original prompt together with the function call and the result to generate a response. + +Functions available for prompting +--------------------------------- + +The following functions are available for the LLM to use: + +- ``knn_search``: This function is used to find documents that are related to a certain keyword. The LLM can specify the number of documents to be found and the number of keywords to be used. The result is retrieved by performing retrieval-augmented-generation (RAG) where the query is embedded, and the most similar documents are retrieved. + +- ``identify_topic_idx``: This function is used to identify the topic that is most related to a certain keyword. This is simply done by providing all topic descriptions to the LLM and then asking for the index of the topic that is most related to the keyword. + +- ``get_topic_information``: This function is used to obtain information on certain topics. This can be useful to compare similar topics. + +- ``split_topic_kmeans``: This function is used to split a topic into subtopics. The LLM can specify the number of subtopics to be created. The result is retrieved by performing k-means clustering on the document embeddings of the documents in the topic. Note that when splitting a topic, the top-words are not completely recomputed, but rather the top-words of the "super"-topic are distributed among the subtopics. + +- ``split_topic_hdbscan``: Works analogously to ``split_topic_kmeans`` but uses Hdbscan instead of k-means clustering. This means that the number of subtopics is not specified by the user but rather automatically determined by Hdbscan. + +- ``split_topic_keywords``: This function is used to split a topic into subtopics based on provided keywords. 
Each keyword is embedded, and the topic is split according to cosine similarity of the document embeddings within the "super"-topic. This means that documents among the "super"-topic that are most similar to a certain keyword are assigned to the corresponding subtopic. + +- ``add_new_topic_keyword``: This function is used to add a new topic based on a keyword. The documents belonging to this new topic are computed as the documents from all other topics that are more similar to the embedding of the new keyword than the centroid of the original topic. Then all topwords and the topic description are recomputed. + +- ``delete_topic``: This function is used to delete a topic. The LLM can specify the topic to be deleted. The result is retrieved by simply removing the topic from the list of topics and assigning the documents of the deleted topic to the topic with the most similar centroid. Then all topwords and the topic description are recomputed. + +- ``combine_topics``: This function is used to combine two topics into a single topic. The LLM can specify the two topics to be combined. The result is retrieved by simply combining the documents of the two topics and re-computing the embeddings and top-words of the new topic. + + + +Limitations and Caveats +------------------------ + +It is important to note that, as a model built on top of inherently stochastic LLMs and all their shortcomings, TopicGPT has several limitations and shortcomings as well. The following list is not aimed at being complete but could provide useful information on what may go wrong when using TopicGPT: + +- **Hallucination**: LLMs are well known for yielding incorrect but coherent and plausible answers that seem convincing but are actually just made up. 
Although we tried to minimize this undesired behavior through carefully designing the used prompts, we found that TopicGPT may hallucinate (especially) with respect to the following aspects: + + - Making up, distorting, or misinterpreting content of documents retrieved via knn-search. + - Incorrectly naming and describing topics based on top-words. Specifically, the model can identify topics that seem coherent and reasonable, although the corresponding documents are not actually related. + +- **Undesired Behavior**: When using the "prompt" or "pprompt" function, TopicGPT may not call the function you intended it to call. This can be alleviated by explicitly telling the model which function to use or directly calling the function yourself. + +- **Stochasticity**: The behavior of TopicGPT is not deterministic and exhibits some randomness. There is always some probability that certain actions do not work as intended at the first try because some components of the LLM do not function as desired. Simply trying again should mostly help with those issues. + + - On the other hand, TopicGPT may also be overly cautious and report that no relevant information has been found or no topic exists that matches a certain keyword, even though it does. This could be caused by designing prompts to prevent the massive occurrence of falsely positive results. + + Note that using GPT-4 in TopicGPT can help to significantly alleviate issues with hallucination. + +- **Erroneous Embeddings**: The document- and word-embeddings used in TopicGPT may not always reflect the actual semantics of the texts correctly. More specifically, the embeddings sometimes reflect, for instance, grammatical or orthographical aspects such that clusters based on those aspects may be identified. 
+ +References +---------- + +The following models, software packages, and ideas are central for TopicGPT: + +- **UMAP**: The Uniform Manifold Approximation and Projection for Dimension Reduction algorithm is used for reducing the dimensionality of document- and word embeddings (McInnes, Leland, John Healy, and James Melville. "Umap: Uniform manifold approximation and projection for dimension reduction." arXiv preprint arXiv:1802.03426 (2018)). + +- **HDBSCAN**: Hierarchical density-based clustering is used to identify the clusters among the dimensionality-reduced document embeddings (McInnes, Leland, John Healy, and Steve Astels. "hdbscan: Hierarchical density-based clustering." J. Open Source Softw. 2.11 (2017): 205). + +- **Agglomerative Clustering**: The agglomerative clustering functionality from sklearn is used to combine topics in case the identified number of clusters exceeds the number of topics specified by the user (Pedregosa, Fabian, et al. "Scikit-learn: Machine learning in Python." Journal of Machine Learning Research 12 (2011): 2825-2830, https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html). + +- **Topword extraction**: Even though the corresponding packages are not directly used, the topword extraction methods used for this package are based on very similar ideas as found in the BerTopic Model (Grootendorst, Maarten. "BERTopic: Neural topic modeling with a class-based TF-IDF procedure." arXiv preprint arXiv:2203.05794 (2022)) in the case of the tf-idf method and in Top2Vec for the centroid-similarity method (Angelov, Dimo. "Top2vec: Distributed representations of topics." arXiv preprint arXiv:2008.09470 (2020)). + +- **LLMs from the GPT family**: Some references for the models for computing embeddings and answering the prompts include: + + - Brown, Tom B., et al. “Language Models are Few-Shot Learners.” Advances in Neural Information Processing Systems 33 (2020). + + - OpenAI.
“GPT-4 Technical Report.” arXiv preprint arXiv:2303.08774 (2023). + + - Radford, Alec, et al. “Improving Language Understanding by Generative Pre-Training.” URL: https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf. + + - Radford, Alec, et al. “Language Models are Unsupervised Multitask Learners.” OpenAI Blog 1.8 (2019): 9. + diff --git a/LLMTopicDetection_TopicGPT/docs/make.bat b/LLMTopicDetection_TopicGPT/docs/make.bat new file mode 100644 index 0000000..dc1312a --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo.
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/LLMTopicDetection_TopicGPT/docs/requirements.txt b/LLMTopicDetection_TopicGPT/docs/requirements.txt new file mode 100644 index 0000000..6fbf1df --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/requirements.txt @@ -0,0 +1,18 @@ +gensim +hdbscan +nltk +numpy +openai +pandas +plotly +regex +scikit-learn +seaborn +sentence-transformers +tiktoken +tokenizers +tqdm +umap-learn +umap-learn[plot] +sphinx +sphinx_rtd_theme diff --git a/LLMTopicDetection_TopicGPT/docs/source/conf.py b/LLMTopicDetection_TopicGPT/docs/source/conf.py new file mode 100644 index 0000000..2bfdf90 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/source/conf.py @@ -0,0 +1,34 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +master_doc = 'index' +project = 'topicgpt' +copyright = '2023, ArikReuter' +author = 'ArikReuter' +release = '0.0.4' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon'] + +templates_path = ['_templates'] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] + + +import os +import sys +sys.path.insert(0, os.path.abspath('../../src')) +sys.path.insert(0, os.path.abspath('../src')) \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/docs/source/index.rst b/LLMTopicDetection_TopicGPT/docs/source/index.rst new file mode 100644 index 0000000..5c6df60 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/source/index.rst @@ -0,0 +1,22 @@ +.. topicgpt documentation master file, created by + sphinx-quickstart on Wed Sep 6 20:34:08 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to topicgpt's documentation! +==================================== + +.. include:: ../README.rst + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + topicgpt + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/LLMTopicDetection_TopicGPT/docs/source/modules.rst b/LLMTopicDetection_TopicGPT/docs/source/modules.rst new file mode 100644 index 0000000..41f8b74 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/source/modules.rst @@ -0,0 +1,7 @@ +topicgpt +======== + +.. toctree:: + :maxdepth: 4 + + topicgpt diff --git a/LLMTopicDetection_TopicGPT/docs/source/topicgpt.rst b/LLMTopicDetection_TopicGPT/docs/source/topicgpt.rst new file mode 100644 index 0000000..03e9b1c --- /dev/null +++ b/LLMTopicDetection_TopicGPT/docs/source/topicgpt.rst @@ -0,0 +1,69 @@ +topicgpt package +================ + +Submodules +---------- + +topicgpt.Clustering module +-------------------------- + +.. automodule:: topicgpt.Clustering + :members: + :undoc-members: + :show-inheritance: + +topicgpt.ExtractTopWords module +------------------------------- + +.. automodule:: topicgpt.ExtractTopWords + :members: + :undoc-members: + :show-inheritance: + +topicgpt.GetEmbeddingsOpenAI module +----------------------------------- + +.. automodule:: topicgpt.GetEmbeddingsOpenAI + :members: + :undoc-members: + :show-inheritance: + +topicgpt.TopicGPT module +------------------------ + +.. automodule:: topicgpt.TopicGPT + :members: + :undoc-members: + :show-inheritance: + +topicgpt.TopicPrompting module +------------------------------ + +.. automodule:: topicgpt.TopicPrompting + :members: + :undoc-members: + :show-inheritance: + +topicgpt.TopicRepresentation module +----------------------------------- + +.. automodule:: topicgpt.TopicRepresentation + :members: + :undoc-members: + :show-inheritance: + +topicgpt.TopwordEnhancement module +---------------------------------- + +.. automodule:: topicgpt.TopwordEnhancement + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: topicgpt + :members: + :undoc-members: + :show-inheritance: diff --git a/LLMTopicDetection_TopicGPT/examples/AmazonReviews.ipynb b/LLMTopicDetection_TopicGPT/examples/AmazonReviews.ipynb new file mode 100644 index 0000000..3b334f1 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/examples/AmazonReviews.ipynb @@ -0,0 +1,821 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example Usage of TopicGPT: Amazon Reviews" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will be using the Amazon Reviews dataset to show how TopicGPT can be useful when analyzing a large corpus of text." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. 
See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\plot.py:203: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. 
See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "from topicgpt.TopicGPT import TopicGPT" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# load api key\n", + "import os\n", + "api_key_openai = os.environ.get('OPENAI_API_KEY')\n", + "\n", + "import openai\n", + "\n", + "openai.organization = \"org-MOfdTrYSke1pXhlAdLXxwDKx\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# data from https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews?resource=download\n", + "\n", + "review_data = pd.read_csv(\"../Data/AmazonReviews/amazon_review_polarity_csv/train.csv\", header=None) # only use the first 10k reviews of the train set\n", + "\n", + "reviews = list(review_data[2])\n", + "reviews = reviews[:10000] # only consider the first 10k reviews " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "tm = TopicGPT(\n", + " openai_api_key = api_key_openai,\n", + " corpus_instruction= \"The Amazon reviews dataset consists of reviews from amazon. 
The data span a period of 18 years, including 10000 reviews up to March 2013.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tm.fit(reviews)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "tm.save_embeddings() #save the computed embeddings for later use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tm.visualize_clusters()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Topic 0: Musical genres and characteristics,\n", + " Topic 1: Sci-fi TV show.,\n", + " Topic 2: Film Genres,\n", + " Topic 3: Paranormal phenomena and UFO sightings,\n", + " Topic 4: Earbuds and Headsets,\n", + " Topic 5: Book Review Topics,\n", + " Topic 6: Gluten-free Cookbook,\n", + " Topic 7: Air Mattresses,\n", + " Topic 8: Crime and Investigation.,\n", + " Topic 9: Printer Troubleshooting,\n", + " Topic 10: Hiking Footwear,\n", + " Topic 11: Shapewear,\n", + " Topic 12: Dance Instruction,\n", + " Topic 13: Parenting and Education,\n", + " Topic 14: Electronic Gadgets,\n", + " Topic 15: Video Games,\n", + " Topic 16: MP3 Player Issues,\n", + " Topic 17: Camera Accessories,\n", + " Topic 18: Power Adapters,\n", + " Topic 19: Product Quality,\n", + " Topic 20: Ancient civilizations and anthropology.,\n", + " Topic 21: Router Connectivity,\n", + " Topic 22: Technical Issues,\n", + " Topic 23: Puritanical Society,\n", + " Topic 24: Sci-fi Space Exploration,\n", + " Topic 25: Beauty Products,\n", + " Topic 26: Sexual Vibrators,\n", + " Topic 27: Home Safety,\n", + " Topic 28: Product Quality,\n", + " Topic 29: Customer Service Experience,\n", + " Topic 30: Textbook Quality,\n", + " Topic 31: Programming Documentation,\n", + " Topic 32: Hardware Tools,\n", + " Topic 33: Product Quality,\n", + " Topic 34: 
Educational Toys,\n", + " Topic 35: Appliances,\n", + " Topic 36: Kitchenware,\n", + " Topic 37: Supernatural Witches,\n", + " Topic 38: Horror Comics,\n", + " Topic 39: Dystopian society,\n", + " Topic 40: Emotional Turmoil,\n", + " Topic 41: Book genres,\n", + " Topic 42: Economic and Political Critique,\n", + " Topic 43: Poorly Written Erotica,\n", + " Topic 44: Dystopian Surveillance State,\n", + " Topic 45: Experimental Poetry,\n", + " Topic 46: Formatting Issues,\n", + " Topic 47: Language Learning Resources,\n", + " Topic 48: Book genres,\n", + " Topic 49: Home Improvement,\n", + " Topic 50: Religious Texts.]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.topic_lis" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. 
See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "c:\\Users\\arik_\\anaconda3\\envs\\llm_sem_test7\\Lib\\site-packages\\umap\\plot.py:203: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. 
See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", + " @numba.jit()\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Sep 6 20:17:07 2023 Building and compiling search function\n" + ] + } + ], + "source": [ + "# load the model if available\n", + "import pickle\n", + "with open(\"../Data/SavedTopicRepresentations/TopicGPT_amazonReviews.pkl\", \"rb\") as f:\n", + " tm = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what topic 2 is about" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The common topic of the given words is \"Movie Reviews\".\n", + "\n", + "Aspects:\n", + "1. Genre: animated, slasher, noir, zombie, thriller.\n", + "2. Quality: watchable, unwatchable, dreadful, cheesy, ridiculous.\n", + "3. Filmmaking: directors, filmmakers, screenwriter, cinematography, filmmaking.\n", + "4. Audience reaction: scariest, thrilling, hilarious, disappointing, shocking.\n", + "5. 
Technical aspects: widescreen, dolby, surround, cinematography, special effects.\n" + ] + } + ], + "source": [ + "print(tm.topic_lis[2].topic_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"identify_topic_idx\",\n", + " \"arguments\": \"{\\n \\\"query\\\": \\\"Avatar\\\"\\n}\"\n", + "}\n", + "GPT wants to the call the function: {\n", + " \"name\": \"get_topic_information\",\n", + " \"arguments\": \"{\\n \\\"topic_idx_lis\\\": [2]\\n}\"\n", + "}\n", + "Yes, the movie \"Avatar\" is mentioned in topic 2, which is about film genres. However, the specific context or sentiment of the mention is not provided.\n" + ] + }, + { + "data": { + "text/plain": [ + "{2: '\\n Topic index: 2\\n Topic name: Film Genres\\n Topic description: The common topic of the given words is \"Movie Reviews\".\\n\\nAspects:\\n1. Genre: animated, slasher, noir, zombie, thriller.\\n2. Quality: watchable, unwatchable, dreadful, cheesy, ridiculous.\\n3. Filmmaking: directors, filmmakers, screenwriter, cinematography, filmmaking.\\n4. Audience reaction: scariest, thrilling, hilarious, disappointing, shocking.\\n5. 
Technical aspects: widescreen, dolby, surround, cinematography, special effects.\\n Topic topwords: [\\'theaters\\', \\'flicks\\', \\'renting\\', \\'animated\\', \\'robots\\', \\'gang\\', \\'rental\\', \\'filmmakers\\', \\'aerial\\', \\'unrated\\', \\'dubbing\\', \\'credits\\', \\'accents\\', \\'slasher\\', \\'scares\\', \\'noir\\', \\'cop\\', \\'cartoons\\', \\'unwatchable\\', \\'watchable\\', \\'directors\\', \\'porn\\', \\'scariest\\', \\'fights\\', \\'aired\\', \\'adaptation\\', \\'locations\\', \\'wrestling\\', \\'zombie\\', \\'corny\\', \\'watches\\', \\'screenwriter\\', \\'chicks\\', \\'theatrical\\', \\'lesbian\\', \\'sequels\\', \\'villa\\', \\'countryside\\', \\'grabs\\', \\'producer\\', \\'combo\\', \\'redeem\\', \\'trapped\\', \\'marine\\', \\'idiot\\', \\'depicted\\', \\'storylines\\', \\'millions\\', \\'ruin\\', \\'flop\\', \\'thrills\\', \\'claustrophobic\\', \\'caves\\', \\'non-stop\\', \\'blockbuster\\', \\'soldier\\', \\'actresses\\', \\'cinematic\\', \\'breathtaking\\', \\'comedian\\', \\'theatres\\', \\'pilots\\', \\'airplanes\\', \\'widescreen\\', \\'steals\\', \\'grainy\\', \\'acclaimed\\', \\'surround\\', \\'channels\\', \\'innovative\\', \\'buff\\', \\'talents\\', \\'motorcycle\\', \\'soccer\\', \\'courage\\', \\'distracting\\', \\'scenario\\', \\'cult\\', \\'portrays\\', \\'compelled\\', \\'scale\\', \\'segments\\', \\'butt\\', \\'promising\\', \\'cliched\\', \\'sub\\', \\'artsy\\', \\'faces\\', \\'kills\\', \\'rubbish\\', \\'involving\\', \\'vulgar\\', \\'incoherent\\', \\'costumes\\', \\'accent\\', \\'disjointed\\', \\'gory\\', \\'gruesome\\', \\'low-budget\\', \\'dreadful\\', \\'scream\\', \\'host\\', \\'fright\\', \\'action-packed\\', \\'goofy\\', \\'caving\\', \\'driven\\', \\'nude\\', \\'re-make\\', \\'audiences\\', \\'over-the-top\\', \\'filmmaking\\', \\'popcorn\\', \\'aircraft\\', \\'newer\\', \\'sport\\', \\'kicks\\', \\'ventriloquist\\', \\'crying\\', \\'rescue\\', \\'buffs\\', \\'deluxe\\', \\'cheated\\', \\'lackluster\\', 
\\'dolby\\', \\'strangers\\', \\'pun\\', \\'pitiful\\', \\'adore\\', \\'suffers\\', \\'initially\\', \\'loser\\', \\'arts\\', \\'shocking\\', \\'criminals\\', \\'weapons\\', \\'lion\\', \\'eerie\\', \\'preview\\', \\'youth\\', \\'parent\\', \\'poignant\\', \\'presence\\', \\'decade\\', \\'sympathetic\\', \\'interview\\', \\'achieve\\', \\'jobs\\', \\'motivation\\', \\'blonde\\', \\'poem\\', \\'transitions\\', \\'convincing\\', \\'marketing\\', \\'consumers\\', \\'candy\\', \\'asks\\', \\'walked\\', \\'miscast\\', \\'stunts\\', \\'nasty\\', \\'zombies\\', \\'thrill\\', \\'vomit\\', \\'stupidity\\', \\'stinker\\', \\'freaking\\', \\'disgusting\\', \\'ridiculously\\', \\'farce\\', \\'chills\\', \\'brainless\\', \\'dude\\', \\'naked\\', \\'ratings\\', \\'appalling\\', \\'pretends\\', \\'disgust\\', \\'filming\\', \\'starred\\', \\'punches\\', \\'downhill\\', \\'des\\', \\'sappy\\', \\'breathless\\', \\'screening\\', \\'funniest\\', \\'rude\\', \\'tripe\\', \\'hairy\\', \\'handsome\\', \\'unfunny\\', \\'genuinely\\', \\'fighters\\', \\'spoiled\\', \\'fighter\\', \\'backdrop\\', \\'dogfight\\', \\'planes\\', \\'aviation\\', \\'fast-paced\\', \\'dogfights\\', \\'chasing\\', \\'destruction\\', \\'dramas\\', \\'silent\\', \\'specials\\', \\'westerns\\', \\'permanent\\', \\'blurry\\', \\'blu-rays\\', \\'bikers\\', \\'crazed\\', \\'transfers\\', \\'cue\\', \\'deaf\\', \\'skipping\\', \\'commercials\\', \\'gritty\\', \\'exit\\', \\'jumped\\', \\'continuity\\', \\'robbed\\', \\'knocked\\', \\'sync\\', \\'boxed\\', \\'studios\\', \\'remastered\\', \\'pacing\\', \\'boat\\', \\'headed\\', \\'lonely\\', \\'misfortune\\', \\'marry\\', \\'beg\\', \\'jungle\\', \\'alley\\', \\'suffer\\', \\'victim\\', \\'sympathy\\', \\'wouldnt\\', \\'hurts\\', \\'handled\\', \\'shouting\\', \\'crawl\\', \\'border\\', \\'intent\\', \\'flashbacks\\', \\'viewed\\', \\'spite\\', \\'throat\\', \\'sacrificing\\', \\'ticket\\', \\'painfully\\', \\'passionate\\', \\'martial\\', \\'narration\\', 
\\'reaction\\', \\'deaths\\', \\'neighbors\\', \\'offend\\', \\'visually\\', \\'perverse\\', \\'realy\\', \\'goal\\', \\'tender\\', \\'portray\\', \\'bullets\\', \\'engaged\\', \\'coverage\\', \\'monotonous\\', \\'unlikely\\', \\'historically\\', \\'banal\\', \\'credibility\\', \\'one-liners\\', \\'depicts\\', \\'caring\\', \\'divorced\\', \\'grave\\', \\'sincere\\', \\'reaches\\', \\'meaningful\\', \\'mild\\', \\'souls\\', \\'downright\\', \\'dramatically\\', \\'involves\\', \\'understatement\\', \\'hates\\', \\'crosses\\', \\'workers\\', \\'interactions\\', \\'overwhelming\\', \\'statues\\', \\'sum\\', \\'photographed\\', \\'ranks\\', \\'aged\\', \\'region\\', \\'post-apocalyptic\\', \\'spoil\\', \\'dud\\', \\'qualities\\', \\'merit\\', \\'borrowed\\', \\'adapted\\', \\'scripts\\', \\'weaker\\', \\'justify\\', \\'purposes\\', \\'coherent\\', \\'gratuitous\\', \\'creature\\', \\'segment\\', \\'whim\\', \\'determine\\', \\'firefighters\\', \\'ok.\\', \\'inaccurate\\', \\'warmth\\', \\'turkey\\', \\'installment\\', \\'picky\\', \\'remotely\\', \\'shell\\', \\'receipt\\', \\'complained\\', \\'phoned\\', \\'cheesey\\', \\'shooting\\', \\'freak\\', \\'cheezy\\', \\'shoots\\', \\'travesty\\', \\'plotless\\', \\'drunk\\', \\'vile\\', \\'spy\\', \\'laughably\\', \\'shameless\\', \\'actors/actresses\\', \\'screams\\', \\'embarrassing\\', \\'rape\\', \\'embarrassed\\', \\'foul\\', \\'half-hour\\', \\'chases\\', \\'retarded\\', \\'crawling\\', \\'spoiler\\', \\'lustful\\', \\'pissed\\', \\'filmmaker\\', \\'disgrace\\', \\'spliced\\', \\'depths\\', \\'uncensored\\', \\'originality\\', \\'twins\\', \\'must-see\\', \\'unreal\\', \\'mansion\\', \\'cameo\\', \\'rendering\\', \\'marvelous\\', \\'comedies\\', \\'crippled\\', \\'comedians\\', \\'marvel\\', \\'fought\\', \\'ghostly\\', \\'spooky\\', \\'bare\\', \\'phenomenal\\', \\'robot\\', \\'underrated\\', \\'beneath\\', \\'comical\\', \\'landing\\', \\'crew\\', \\'heartwarming\\', \\'mute\\', \\'finale\\', \\'teaser\\', 
\\'airplane\\', \\'nonetheless\\', \\'campy\\', \\'autobots\\', \\'five-star\\', \\'beating\\', \\'reruns\\', \\'marries\\', \\'fond\\', \\'flawless\\', \\'avenger\\', \\'cavalry\\', \\'faded\\', \\'laughter\\', \\'streaming\\', \\'viewings\\', \\'pixelated\\', \\'letterboxed\\', \\'dub\\', \\'biker\\', \\'bluray\\', \\'spiderman\\', \\'beloved\\', \\'beautifull\\', \\'broadcast\\', \\'boxset\\', \\'swinging\\', \\'restored\\', \\'captioning\\', \\'organ\\', \\'skips\\', \\'previews\\', \\'disapointment\\', \\'tickets\\', \\'peanuts\\', \\'holidays\\', \\'insomnia\\', \\'geared\\', \\'suit\\', \\'perfection\\', \\'crisp\\', \\'mesmerizing\\', \\'butter\\', \\'bus\\', \\'vehicle\\', \\'cloying\\', \\'butchered\\', \\'slap\\', \\'spots\\', \\'angles\\', \\'builds\\', \\'tossed\\', \\'facial\\', \\'ripping\\', \\'jumping\\', \\'manufactured\\', \\'glued\\', \\'struck\\', \\'idiots\\', \\'cliff\\', \\'bullet\\', \\'accident\\', \\'performs\\', \\'shakes\\', \\'lossless\\', \\'expired\\', \\'yell\\', \\'grass\\', \\'hurl\\', \\'walks\\', \\'flesh\\', \\'quirky\\', \\'sooo\\', \\'expedition\\', \\'stilted\\', \\'so-so\\', \\'sue\\', \\'cheating\\', \\'regrettably\\', \\'cried\\', \\'alike\\', \\'afterwards\\', \\'escapes\\', \\'headache\\', \\'den\\', \\'shy\\', \\'bisexual\\', \\'effeminate\\', \\'wit\\', \\'suffering\\', \\'cabin\\', \\'minimal\\', \\'riveting\\', \\'brow\\', \\'racism\\', \\'thankfully\\', \\'kinky\\', \\'songwriter\\', \\'mind-numbing\\', \\'leap\\', \\'artifacts\\', \\'attraction\\', \\'enticing\\', \\'mill\\', \\'collectors\\', \\'whiny\\', \\'bickering\\', \\'flawed\\', \\'ton\\', \\'adopted\\', \\'tortured\\', \\'assumed\\', \\'nominated\\', \\'maintain\\']'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Is the movie Avatar mentioned in topic 2?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check the output, we actually inspect the 
respective document at index 1498: " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not just because of the 3D, but because this is the version where they made an effort to optimize the picture! They released this on blu ray like like avatar was released. First they release the movie without any optimization and special features. Which means the picture looks better than DVD but not the best that blu ray can be.(which means a grainy looking picture that looks like the characters are in a sandstorm and there is a lack of detail that you expect in a blu ray). The they make the limited edition which is made the way a blu ray is supposed to be down. So if you are wondering which one to choose, this is the one you want! All the features with the visuals to boot!\n" + ] + } + ], + "source": [ + "print(tm.topic_lis[2].documents[1498])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us go on with the analysis. Since it is easy to lose the overview over all the topics, let's find out which one is about books" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"identify_topic_idx\",\n", + " \"arguments\": \"{\\n \\\"query\\\": \\\"books\\\"\\n}\"\n", + "}\n", + "Topic 5 is about books.\n" + ] + }, + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Which topic is about books?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The common topic of the given words is \"Book Reviews\". 
\n", + "\n", + "Various aspects and sub-topics of this topic include:\n", + "1. Characters: likable, endearing, mighty, crew, pals\n", + "2. Storyline: satirical, mythological, strange, satirical, endings\n", + "3. Writing style: well-crafted, inviting, brilliantly, sarcastic\n", + "4. Themes: philosophical, allusions, belief, religion, obsession\n", + "5. Critique: uneven, dissatisfaction, novice, pale, unsuccessful\n" + ] + } + ], + "source": [ + "print(tm.topic_lis[5].topic_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"knn_search\",\n", + " \"arguments\": \"{\\n \\\"topic_index\\\": 4,\\n \\\"query\\\": \\\"Harry Potter\\\",\\n \\\"k\\\": 5\\n}\"\n", + "}\n", + "No, there is no mention of Harry Potter in topic 5. The documents that are most closely related to the query \"Harry Potter\" do not mention the topic.\n" + ] + }, + { + "data": { + "text/plain": [ + "([\"Unable to use. Compartments too tiny and too deep to reach in to get earrings - and I don't have unusually large fingers.\",\n", + " \"I wanted an in ear blue tooth headset but couldn't get them to stay in. these made it work!\",\n", + " 'people should buy headsets to fit these things bc they are so essential..A must have for any headset that will fit them',\n", + " 'I am am a musician so I use these with earbuds coming off my computer within my headphones which is powered from my Marshall Amplifier. They transfer the sound well and are very well made.',\n", + " 'The Jabra Eargels are wonderful.. I cannot believe how comfortable, and user friendly they are. 
Thank you, if not for these, I could not use my bluetooth.Great Merchandise..'],\n", + " [71, 27, 30, 72, 69])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Is Harry Potter mentioned in topic 5?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Topic 0 (\"Musical genres and characteristics\") sounds a bit general and from the visual inspection it seems to contain a lot of documents. So let's break it down a little bit" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"split_topic_kmeans\",\n", + " \"arguments\": \"{\\n \\\"topic_idx\\\": 0,\\n \\\"inplace\\\": true\\n}\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:03]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:01<00:00, 1.29s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + " 0%| | 0/1 [00:00 1\u001b[0m tm\u001b[39m.\u001b[39mtopic_lis\n", + "\u001b[1;31mNameError\u001b[0m: name 'tm' is not defined" + ] + } + ], + "source": [ + "tm.topic_lis" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 0: Electronics Equipment Sales\n", + "\n", + "Topic_description: The common topic of the given words appears to be \"electronics and technology\". \n", + "\n", + "Various aspects and sub-topics of this topic include:\n", + "1. Buying and selling: \"offer\", \"sale\", \"sell\", \"price\", \"buy\"\n", + "2. Device usage and features: \"use\", \"get\", \"new\", \"used\", \"condition\"\n", + "3. 
Technical specifications: \"wire\", \"ground\", \"power\", \"circuit\", \"voltage\"\n", + "4. Communication and connectivity: \"phone\", \"email\", \"modem\", \"wireless\", \"connection\"\n", + "5. Accessories and peripherals: \"battery\", \"cable\", \"manuals\", \"disk\", \"monitor\"\n", + "Top words: [\"n't\", 'one', 'would', 'use', 'like', 'get', 'new', 'used', 'offer', 'sale']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 1: Image Processing\n", + "\n", + "Topic_description: The common topic of the given words is \"Image Processing and Graphics\". \n", + "\n", + "Aspects and sub-topics of this topic include:\n", + "1. Image Manipulation: \"file\", \"image\", \"format\", \"data\", \"color\"\n", + "2. Software and Tools: \"program\", \"software\", \"package\", \"tools\", \"library\"\n", + "3. Display and Visualization: \"window\", \"display\", \"widget\", \"graphics\", \"screen\"\n", + "4. Application and Usage: \"use\", \"application\", \"program\", \"work\", \"run\"\n", + "5. Conversion and Compatibility: \"convert\", \"version\", \"formats\", \"compatible\", \"support\"\n", + "Top words: [\"n't\", 'file', 'image', 'use', 'available', 'get', 'also', 'program', 'files', 'one']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 2: Gun control\n", + "\n", + "Topic_description: The common topic of the provided words is \"gun control\". \n", + "\n", + "Aspects and sub-topics of the topic include:\n", + "1. Public opinion and government: \"people\", \"government\", \"think\", \"believe\", \"public\"\n", + "2. Laws and regulations: \"law\", \"control\", \"laws\", \"crime\", \"illegal\"\n", + "3. Safety and crime prevention: \"safety\", \"crime\", \"violence\", \"criminals\", \"protection\"\n", + "4. 
Rights and individual freedom: \"rights\", \"freedom\", \"individual\", \"rights\", \"personal\"\n", + "5. Policy and legislation: \"government\", \"policy\", \"legislation\", \"enforcement\", \"regulations\"\n", + "Top words: [\"n't\", 'would', 'people', 'think', 'one', 'know', 'gun', 'get', 'like', 'government']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 3: Online Privacy and Anonymity\n", + "\n", + "Topic_description: The common topic of the given words is \"Internet Privacy\". \n", + "\n", + "Aspects of the topic include:\n", + "1. Anonymity: Protecting identity online.\n", + "2. Security: Ensuring the safety of information.\n", + "3. Privacy: Maintaining confidentiality of personal data.\n", + "4. Encryption: Securing data through coding.\n", + "5. Anonymity tools: Services to hide online activities.\n", + "Top words: ['anonymous', 'email', 'internet', 'address', 'information', \"n't\", 'privacy', 'mail', 'one', 'use']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 4: Conflict and Violence.\n", + "\n", + "Topic_description: The common topic of the provided words appears to be \"conflict and violence\". \n", + "\n", + "Aspects and sub-topics related to this topic include:\n", + "1. War and military actions: soldiers, troops, genocide, massacre, attack.\n", + "2. Government and politics: government, political, state, policy, authorities.\n", + "3. Human rights and justice: rights, civilians, ethnic, leaders, atrocities.\n", + "4. Victims and suffering: killed, dead, bodies, wounded, victims.\n", + "5. 
International involvement: international, countries, support, forces, nations.\n", + "Top words: [\"n't\", 'people', 'would', 'one', 'said', 'could', 'know', 'like', 'time', 'also']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 5: Computer Hardware\n", + "\n", + "Topic_description: The common topic of the provided words is computer hardware. The various aspects and sub-topics of this topic include: \n", + "- Storage: disk, drive, hard, floppy, drives, data\n", + "- Components: card, controller, board, chip, motherboard, power, connector\n", + "- Peripherals: monitor, video, printer, modem, cable, port\n", + "- Performance: speed, memory, clock, processor, cache\n", + "- Software: system, software, program, interface, drivers\n", + "- User experience: problem, work, support, problem, run, help\n", + "Top words: [\"n't\", 'drive', 'card', 'one', 'would', 'use', 'know', 'get', 'like', 'disk']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 6: Belief and Atheism\n", + "\n", + "Topic_description: The common topic of the provided words is \"Belief and Religion\". \n", + "\n", + "Aspects and sub-topics of this topic include:\n", + "1. Atheism: \"atheists\", \"atheism\", \"atheist\"\n", + "2. God and Religion: \"god\", \"religion\", \"religious\", \"religions\"\n", + "3. Faith and Beliefs: \"faith\", \"belief\", \"beliefs\"\n", + "4. Morality and Ethics: \"moral\", \"morality\", \"morals\"\n", + "5. 
Science and Truth: \"science\", \"truth\", \"evidence\"\n", + "\n", + "Please note that these descriptions are based solely on the provided words and may not capture the full complexity of the topic.\n", + "Top words: [\"n't\", 'one', 'would', 'people', 'think', 'believe', 'say', 'know', 'like', 'evidence']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 7: Online Discussions\n", + "\n", + "Topic_description: The common topic of the given words appears to be online discussions or forums. \n", + "\n", + "Various aspects and sub-topics of this topic include:\n", + "1. Communication: Words like \"say,\" \"post,\" \"letter,\" and \"quote\" indicate the act of expressing thoughts and opinions.\n", + "2. Interaction: Words like \"reply,\" \"discuss,\" and \"answer\" suggest engagement and exchange of ideas.\n", + "3. Opinion sharing: Words like \"think,\" \"believe,\" and \"opinions\" indicate the expression of personal viewpoints.\n", + "4. Group dynamics: Words like \"group,\" \"members,\" and \"names\" suggest the presence of a community or collective.\n", + "5. Internet culture: Words like \"internet,\" \"newsgroup,\" and \"usenet\" imply the online nature of the discussions.\n", + "Top words: [\"n't\", 'people', 'say', 'like', 'group', 'would', 'post', 'know', 'one', 'think']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 8: Computer Software\n", + "\n", + "Topic_description: The common topic of the given words is **computer software and troubleshooting**.\n", + "\n", + "Aspects and sub-topics of the topic include:\n", + "1. **Operating system**: Windows, DOS, system, memory, disk, drive.\n", + "2. **Software usage**: Program, use, run, files, software, applications.\n", + "3. 
**Problem-solving**: Help, support, fix, error, crashes.\n", + "4. **Hardware**: Keyboard, mouse, graphics, hardware, device.\n", + "5. **User experience**: Interface, user, easy, features, user-friendly.\n", + "Top words: [\"n't\", 'file', 'use', 'program', 'windows', 'would', 'one', 'disk', 'get', 'like']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 9: Car Features and Performance\n", + "\n", + "Topic_description: The common topic among the provided words is \"cars\" or \"automobiles\". \n", + "\n", + "Aspects and sub-topics of the topic include:\n", + "- Car features: engine, radar, clutch, transmission, gear, tires, brakes, suspension, steering, throttle, gas, body, top, convertible, interior, seat, foot, door, shifter, gears, radio, road, wheels, oil, air, gauge, dash, windows, fuel, speedo, battery, factory, automatics, manuals, tire, recall, rust, water, damage, paint, reading, stereo, sunroof, safety, lights, coupe, sport, mechanical, handling.\n", + "- Car performance: speed, power, torque, performance, highway, acceleration, handling.\n", + "- Car condition: used, problem, condition, mileage, parts, odometer, maintenance, service, repair, rust, damage.\n", + "- Car buying/selling: price, dealer, used, buying, sell, market, value, dealership, trade-in, sales, invoice, purchase, insurance.\n", + "- Car models and brands: new, model, make, year, models, brand, manufacturer.\n", + "- Car driving experience: drive, automatic, manual, clutch, transmission, gear, shift, driver, driving, pedal, road, traffic, highway, speed, control, handling.\n", + "- Car opinions and reviews: opinion, review, experience, question, opinion, mine, works, questions, opinions.\n", + "\n", + "Please note that these sub-topics are inferred based on the provided words and may not cover all possible aspects and sub-topics related to cars.\n", + "Top 
words: ['car', \"n't\", 'would', 'like', 'one', 'new', 'engine', 'cars', 'get', 'know']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 10: Encryption and Government\n", + "\n", + "Topic_description: The common topic of the given words is \"encryption and government access\". \n", + "\n", + "Aspects and sub-topics:\n", + "1. Encryption methods: algorithms, keys, public-key, ciphers.\n", + "2. Government access: law, enforcement, wiretap, wiretaps, agencies.\n", + "3. Privacy and security: secure, secure communication, secure system.\n", + "4. Technology and devices: chip, phone, device, hardware, computer.\n", + "5. Public opinion: debate, controversy, opinions, concerns.\n", + "Top words: [\"n't\", 'key', 'would', 'encryption', 'government', 'use', 'one', 'chip', 'keys', 'people']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 11: Technology and Computing.\n", + "\n", + "Topic_description: The common topic of the given words is \"Plumbing\". The various aspects and sub-topics of this topic include: \n", + "1. \"Pleasure\" and \"Pleasing\" - Satisfaction and enjoyment in plumbing work.\n", + "2. \"Plenty\" and \"Plentiful\" - Abundance of plumbing supplies.\n", + "3. \"Plugs\" and \"Plugging\" - Sealing and fitting pipes.\n", + "4. \"Plunger\" and \"Plunged\" - Unclogging drains and toilets.\n", + "5. 
\"Plumbing Fixtures\" - Various components used in plumbing systems.\n", + "Top words: ['a.a', 'plesetsk', 'plenty', 'plentiful', 'pledged', 'pledge', 'pleasures', 'pleasure', 'pleasing', 'pleases']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 12: Technology and Computing\n", + "\n", + "Topic_description: The common topic of the given words is \"Plants\". \n", + "\n", + "Aspects and sub-topics of the topic \"Plants\":\n", + "1. Pleasure and enjoyment: pleasures, pleasure, pleasing, pleases, pleased.\n", + "2. Pledges and commitments: pledged, pledge.\n", + "3. Variety and abundance: plenty, plentiful, plethora.\n", + "4. Play and entertainment: plays, playmates, playmation, playground.\n", + "5. Plumbing and construction: plumbing, plugs, plugging, pliers, plunger.\n", + "Top words: ['a.a', 'plesetsk', 'plenty', 'plentiful', 'pledged', 'pledge', 'pleasures', 'pleasure', 'pleasing', 'pleases']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 13: Space Exploration\n", + "\n", + "Topic_description: The common topic of the given words is \"space exploration\". \n", + "\n", + "Aspects and sub-topics of the topic include:\n", + "1. Spacecraft and Missions: Launch, orbit, satellite, mission, space probes.\n", + "2. Technology and Systems: Systems, data, technology, software, hardware.\n", + "3. Astronomical Objects: Planets, moon, stars, universe, galaxies.\n", + "4. Challenges and Problems: Cost, funding, safety, technical difficulties.\n", + "5. 
Scientific Research: Astronomy, physics, scientific discoveries, research projects.\n", + "Top words: [\"n't\", 'would', 'space', 'one', 'like', 'could', 'launch', 'also', 'orbit', 'time']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 14: Motorcycle Riding Techniques\n", + "\n", + "Topic_description: The common topic of the provided words is \"Motorcycling\". \n", + "\n", + "Aspects and sub-topics of this topic include:\n", + "1. Riding experience: \"ride\", \"riding\", \"bikes\", \"road\", \"miles\"\n", + "2. Motorcycle maintenance: \"oil\", \"engine\", \"gas\", \"exhaust\", \"brake\"\n", + "3. Safety and gear: \"helmet\", \"gear\", \"traffic\", \"lights\", \"helmets\"\n", + "4. Road conditions and hazards: \"lane\", \"traffic\", \"curve\", \"speed\", \"pothole\"\n", + "5. Tips and advice: \"advice\", \"tips\", \"technique\", \"recommend\", \"learn\"\n", + "Top words: [\"n't\", 'bike', 'one', 'would', 'get', 'like', 'time', 'car', 'much', 'dog']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 15: Technology\n", + "\n", + "Topic_description: The common topic of the provided words is \"planning and activities\". \n", + "\n", + "Sub-topics and aspects of this topic include:\n", + "1. Pledges and pleas: pledge, pledge, please, pleased, pledging\n", + "2. Pleasure and enjoyment: pleasure, pleasing, pleasant, pleasures\n", + "3. Play and entertainment: plays, playmates, playoffs, playmation\n", + "4. Planting and growth: plants, planting, planters, planted\n", + "5. 
Policing and policies: police, policy, policing, policies\n", + "Top words: ['a.a', 'plesetsk', 'plenty', 'plentiful', 'pledged', 'pledge', 'pleasures', 'pleasure', 'pleasing', 'pleases']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 16: Hockey Games\n", + "\n", + "Topic_description: The common topic of the given words is \"hockey\". \n", + "\n", + "Aspects of the topic include: \n", + "1. Game: \"play\", \"games\", \"played\", \"scoring\", \"shot\"\n", + "2. Teams: \"team\", \"teams\", \"players\", \"player\", \"defense\"\n", + "3. Seasons: \"season\", \"year\", \"years\", \"playoffs\", \"regular\"\n", + "4. Goals: \"goal\", \"goals\", \"scored\", \"net\", \"scoring\"\n", + "5. Fans: \"fans\", \"watch\", \"watching\", \"crowd\", \"cheer\"\n", + "Top words: [\"n't\", 'game', 'team', 'would', 'games', 'play', 'hockey', 'period', 'one', 'season']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 17: Health and Medicine.\n", + "\n", + "Topic_description: The common topic of the given words appears to be \"health and medicine\". \n", + "\n", + "Aspects and sub-topics related to this topic include:\n", + "1. Diseases and conditions: \"patients\", \"disease\", \"infection\", \"cancer\", \"symptoms\"\n", + "2. Medical treatment: \"treatment\", \"medicine\", \"therapy\", \"drug\", \"clinical\"\n", + "3. Health information and research: \"health\", \"research\", \"medical\", \"evidence\", \"studies\"\n", + "4. Prevention and risk factors: \"prevention\", \"risk\", \"causes\", \"risk\", \"factors\"\n", + "5. 
Healthcare professionals: \"doctor\", \"physician\", \"doctors\", \"physicians\", \"hospital\"\n", + "\n", + "Please note that these are general sub-topics inferred from the provided words and may not cover all possible aspects within the topic.\n", + "Top words: [\"n't\", 'one', 'would', 'patients', 'disease', 'people', 'use', 'know', 'also', 'like']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 18: Baseball games and teams.\n", + "\n", + "Topic_description: The common topic of the given words is \"baseball\". \n", + "\n", + "Aspects and sub-topics of the topic \"baseball\" include: \n", + "1. Teams and players: \"team\", \"players\", \"pitcher\", \"hitter\", \"fielder\"\n", + "2. Game elements: \"game\", \"hit\", \"runs\", \"ball\", \"run\"\n", + "3. Performance and skills: \"good\", \"better\", \"average\", \"pitching\", \"batting\"\n", + "4. Seasons and years: \"year\", \"last\", \"season\", \"years\", \"career\"\n", + "5. Strategy and analysis: \"think\", \"strategy\", \"stats\", \"record\", \"analysis\"\n", + "Top words: [\"n't\", 'would', 'year', 'game', 'team', 'think', 'one', 'good', 'last', 'games']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "Topic 19: Beliefs about Homosexuality.\n", + "\n", + "Topic_description: The common topic of the given words appears to be \"religion and morality\". \n", + "\n", + "Aspects and sub-topics of this topic include:\n", + "1. Beliefs and faith: \"sin\", \"faith\", \"belief\", \"eternal\", \"heaven\"\n", + "2. Morality and ethics: \"homosexuality\", \"marriage\", \"sinful\", \"sexual\", \"wrong\"\n", + "3. Biblical interpretation: \"passage\", \"context\", \"interpretation\", \"translation\", \"verses\"\n", + "4. 
Religious practices: \"baptism\", \"worship\", \"ceremony\", \"churches\", \"prayer\"\n", + "5. Salvation and redemption: \"salvation\", \"repent\", \"saved\", \"forgiveness\", \"sins\"\n", + "Top words: [\"n't\", 'would', 'one', 'people', 'sin', 'think', 'say', 'know', 'believe', 'homosexuality']\n", + "\n", + "------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "tm.print_topics()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tm.visualize_clusters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Obtain more detailed information about the topics" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"knn_search\",\n", + " \"arguments\": \"{\\n \\\"topic_index\\\": 13,\\n \\\"query\\\": \\\"moon landing\\\"\\n}\"\n", + "}\n", + "Topic 13, which is related to the keyword \"moon landing,\" contains information about various aspects of space exploration and missions to the Moon. Here are some key points:\n", + "\n", + "1. The United States has sent automated spacecraft and human-crewed expeditions to explore the Moon. These missions have provided significant knowledge and understanding of the lunar surface.\n", + " - Document index: 258\n", + "\n", + "2. NASA's automated spacecraft for solar system exploration come in various shapes and sizes. Each spacecraft consists of scientific instruments selected for specific missions and is supported by basic subsystems for electrical power, trajectory control, and data communication with Earth.\n", + " - Document index: 535\n", + "\n", + "3. 
The Mariner missions, conducted between 1962 and 1975, played a crucial role in the early planetary reconnaissance of the Moon and other terrestrial planets like Venus and Mars.\n", + " - Document index: 357, 321\n", + "\n", + "4. There was a proposal for a Back to the Moon bill that aimed to incentivize private companies to develop lunar orbiters and explore the Moon. The program had a budget cap of $65 million.\n", + " - Document index: 102\n", + "\n", + "5. There have been discussions and speculations about the possibility of alternative approaches to lunar missions, such as launching from altitude instead of ground pads and using bio-engineered CO2 absorbing plants for life support.\n", + " - Document index: 165, 458\n", + "\n", + "6. There have been ideas and discussions about creating a lunar habitat, developing cost-effective moon missions, and setting up a moon base.\n", + " - Document index: 323, 50, 344, 545\n", + "\n", + "7. There have been debates regarding the authenticity of the first spacewalk and speculation about the possibility of staging a fake moon landing.\n", + " - Document index: 571\n", + "\n", + "Please note that the information is a summary of the key points found in the documents related to topic 13. For more detailed information, please refer to the corresponding document indices mentioned.\n" + ] + }, + { + "data": { + "text/plain": [ + "(['This file and other text and image files from JPL missions are available from the JPL Info public access computer site, reachable by Internet via anonymous ftp to pubinfo.jpl.nasa.gov (128.149.6.2); or by dialup modem to +1 (818) 354-1333, up to 9600 bits per second, parameters N-8-1. 
----------------------------------------------------------------- Our Solar System at a Glance Information Summary PMS 010-A (JPL) June 1991 JPL 410-34-1 6/91 NASA National Aeronautics and Space Administration Jet Propulsion Laboratory California Institue of Technology Pasadena, California For a printed copy of this publication contact the public mail office at the NASA center in your geographic region. INTRODUCTION From our small world we have gazed upon the cosmic ocean for untold thousands of years. Ancient astronomers observed points of light that appeared to move among the stars. They called these objects planets, meaning wanderers, and named them after Roman deities -- Jupiter, king of the gods; Mars, the god of war; Mercury, messenger of the gods; Venus, the god of love and beauty, and Saturn, father of Jupiter and god of agriculture. The stargazers also observed comets with sparkling tails, and meteors or shooting stars apparently falling from the sky. Science flourished during the European Renaissance. Fundamental physical laws governing planetary motion were discovered, and the orbits of the planets around the Sun were calculated. In the 17th century, astronomers pointed a new device called the telescope at the heavens and made startling discoveries. But the years since 1959 have amounted to a golden age of solar system exploration. Advancements in rocketry after World War II enabled our machines to break the grip of Earth\\'s gravity and travel to the Moon and to other planets. The United States has sent automated spacecraft, then human-crewed expeditions, to explore the Moon. Our automated machines have orbited and landed on Venus and Mars; explored the Sun\\'s environment; observed comets, and made close-range surveys while flying past Mercury, Jupiter, Saturn, Uranus and Neptune. These travelers brought a quantum leap in our knowledge and understanding of the solar system. 
Through the electronic sight and other \"senses\" of our automated spacecraft, color and complexion have been given to worlds that for centuries appeared to Earth-bound eyes as fuzzy disks or indistinct points of light. And dozens of previously unknown objects have been discovered. Future historians will likely view these pioneering flights through the solar system as some of the most remarkable achievements of the 20th century. AUTOMATED SPACECRAFT The National Aeronautics and Space Administration\\'s (NASA\\'s) automated spacecraft for solar system exploration come in many shapes and sizes. While they are designed to fulfill separate and specific mission objectives, the craft share much in common. Each spacecraft consists of various scientific instruments selected for a particular mission, supported by basic subsystems for electrical power, trajectory and orientation control, as well as for processing data and communicating with Earth. Electrical power is required to operate the spacecraft instruments and systems. NASA uses both solar energy from arrays of photovoltaic cells and small nuclear generators to power its solar system missions. Rechargeable batteries are employed for backup and supplemental power. Imagine that a spacecraft has successfully journeyed millions of miles through space to fly but one time near a planet, only to have its cameras and other sensing instruments pointed the wrong way as it speeds past the target! To help prevent such a mishap, a subsystem of small thrusters is used to control spacecraft. The thrusters are linked with devices that maintain a constant gaze at selected stars. Just as Earth\\'s early seafarers used the stars to navigate the oceans, spacecraft use stars to maintain their bearings in space. With the subsystem locked onto fixed points of reference, flight controllers can keep a spacecraft\\'s scientific instruments pointed at the target body and the craft\\'s communications antennas pointed toward Earth. 
The thrusters can also be used to fine-tune the flight path and speed of the spacecraft to ensure that a target body is encountered at the planned distance and on the proper trajectory. Between 1959 and 1971, NASA spacecraft were dispatched to study the Moon and the solar environment; they also scanned the inner planets other than Earth -- Mercury, Venus and Mars. These three worlds, and our own, are known as the terrestrial planets because they share a solid-rock composition. For the early planetary reconnaissance missions, NASA employed a highly successful series of spacecraft called the Mariners. Their flights helped shape the planning of later missions. Between 1962 and 1975, seven Mariner missions conducted the first surveys of our planetary neighbors in space. All of the Mariners used solar panels as their primary power source. The first and the',\n", + " 'Although the $1 billion scheme is a fantasy (it\\'s an old canard in the space business called \"trolling for billionaires\"), there is a good chance that a much smaller program ($65 million) will pass the 103rd Congress. This is the Back to the Moon bill, put together by the people who passed the Launch Services Purchase Act. The bill would incent private companies to develop lunar orbiters, with vendors selected on the basis of competitive bidding. There is an aggregate cap on the bids of $65 million. Having a single rich individual paying billions for lunar missions is probably worse than having the government bankroll a $65 million program, as the Delta Clipper program has shown (DC-X was funded by SDIO at $59 million). We have a clear chance of making a lunar mission happen in this decade - as opposed to simply wishing for our dreams to come true. Please support the Back to the Moon bill. For more information, please send E-mail with your U.S. postal service address.',\n", + " \"Afraid I can't give any more info on this.. and hoping someone in greter NETLAND has some details. 
A short story in the newspaper a few days ago made some sort of mention about how the Japanese, using what sounded like a gravity assist, had just managed to crash (or crash-land) a package on the moon. the article was very vague and unclear. and, to make matters worse, I didn't clip it. does this jog anyone's memory? \",\n", + " \"Suppose the Soviets had managed to get their moon rocket working and had made it first. They could have beaten us if either: * Their rocket hadn't blown up on the pad thus setting them back, and/or * A Saturn V went boom. If they had beaten us, I speculate that the US would have gone head and done some landings, but we also would have been more determined to set up a base (both in Earth Orbit and on the Moon). Whether or not we would be on Mars by now would depend upon whether the Soviets tried to go. Setting up a lunar base would have stretched the budgets of both nations and I think that the military value of a lunar base would outweigh the value of going to Mars (at least in the short run). Thus we would have concentrated on the moon. \",\n", + " ' Their Hiten engineering-test mission spent a while in a highly eccentric Earth orbit doing lunar flybys, and then was inserted into lunar orbit using some very tricky gravity-assist-like maneuvering. This meant that it would crash on the Moon eventually, since there is no such thing as a stable lunar orbit (as far as anyone knows), and I believe I recall hearing recently that it was about to happen.',\n", + " 'COMMERCIAL SPACE NEWS/SPACE TECHNOLOGY INVESTOR NUMBER 22 This is number twenty-two in an irregular series on commercial space activities. The commentaries included are my thoughts on these developments. Sigh... as usual, I\\'ve gotten behind in getting this column written. I can only plead the exigency of the current dynamics in the space biz. 
This column is put together at lunch hour and after the house quiets down at night, so data can quickly build up if there\\'s a lot of other stuff going on. I\\'ve complied a lot of information and happenings since the last column, so I\\'m going to have to work to keep this one down to a readable length. Have fun! CONTENTS: 1- US COMMERCIAL SPACE SALES FLATTEN IN 1993 2- DELTA WINS TWO KEY LAUNCH CONTRACTS 3- COMMERCIAL REMOTE SENSING VENTURE GETS DOC \"GO-AHEAD\" 4- INVESTMENT FIRM CALLS GD\\'S SPACE BIZ \"STILL A GOOD INVESTMENT\" 5- ARIANE PREDICTS DIP IN LAUNCH DEMAND 6- NTSB INVESTIGATES PEGASUS LAUNCH OVER ABORTED ABORT 7- ANOTHER PEGASUS COMPETITOR IS ANNOUNCED 8- GEORGIA LAUNCH SITE DROPPED FROM PLANNING 9- SPAIN\\'S CAPRICORNIA LAUNCHER STILL PROCEEDING 10- PACASTRO SIGNS LAUNCH RESERVATION WITH SWEDISH SPACE CORP 11- CHINA AND TAIWAN JOINT SATELLITE VENTURE REPORTED 12- SOUTH KOREA ANNOUNCES NATIONAL MOVE INTO SPACE TECHNOLOGIES 13- SPACE TECHNOLOGY INDEXES THROUGH MARCH FINAL NOTES ARTICLES -------------------------------------------------------------------- 1- US COMMERCIAL SPACE SALES FLATTEN IN 1993 The US Department of Commerce projects US commercial space sales will remain flat in 1993, with current data showing only a 2 percent growth over 1992. As published in \"US Industrial Outlook 1993\" (which was released in January), revenues from the 1993 US space business are currently projected to be about $4,890 M. In contrast to previous years when US commercial space sales had shown double digits growth rates, this year\\'s projected results are driven by the US satellite manufacturing industry, where sales are projected to drop from 12 satellites worth $1,300 M in 1992 to 7 satellites worth $ 670 M in 1993. The US Industrial Outlook also projects U.S. 
commercial launchers faces flat demand in coming year, and while predicting that 1993 revenues will increase 10 percent to $450 M, future sales will be \"adversely affected by the downward revision in Department of Defense launch plans.\" Offsetting flat launch revenues and satellite deliveries, revenues for fixed and mobile satellite services are projected to increase to $1,900 M, primarily driven by increased revenues from broadcast and cable TV networks. Similarly, remote sensing products and sales are projected to increase to $250 M in 1993 (up 15%). US COMMERCIAL SPACE REVENUES 1989 1990 1991 1992(r) 1993(e) Commercial satellites 900 1,000 1,100 1,300 670 Satellite services 750 800 1,200 1,500 1,900 Fixed (700) (735)(1,115)(1,275) (1,520) Mobile (50) ( 65)( 85)( 225) ( 380) Satellite ground equip 790 860 1,350 1,400 1,560 Mobile equipment (40) (85) (280) (352) ??? Commercial launches 150 570 380 450 450 Remote sensing data and services 125 155 190 215 250 Private microgravity research lab -- -- -- -- 60 ===== ===== ====== ===== ===== TOTAL ANNUAL REVENUES 2,715 3,385 4,220 4,815 4,890 (r) = revised data for 1992 (e) = estimated data for 1993 [Commentary: This is the first look at',\n", + " \"Original to: keithley@apple.com G'day keithley@apple.com 21 Apr 93 22:25, keithley@apple.com wrote to All: kc> keithley@apple.com (Craig Keithley), via Kralizec 3:713/602 kc> But back to the contest goals, there was a recent article in AW&ST about a kc> low cost (it's all relative...) manned return to the moon. A General kc> Dynamics scheme involving a Titan IV & Shuttle to lift a Centaur upper kc> stage, LEV, and crew capsule. The mission consists of delivering two kc> unmanned payloads to the lunar surface, followed by a manned mission. kc> Total cost: US was $10-$13 billion. Joint ESA(?)/NASA project was $6-$9 kc> billion for the US share. kc> moon for a year. Hmmm. Not really practical. 
Anyone got a kc> cheaper/better way of delivering 15-20 tonnes to the lunar surface within kc> the decade? Anyone have a more precise guess about how much a year's kc> supply of consumables and equipment would weigh? Why not modify the GD plan into Zurbrin's Compact Moon Direct scheme? let one of those early flight carry an O2 plant and make your own. ta Ralph\",\n", + " \"Why use a ground launch pad. It is entirely posible to launch from altitude. This was what the Shuttle was originally intended to do! It might be seriously cheaper. Also, what about bio-engineered CO2 absorbing plants instead of many LOX bottles? Stick 'em in a lunar cave and put an airlock on the door. \",\n", + " ' The gravity maneuvering that was used was to exploit \\'fuzzy regions\\'. These are described by the inventor as exploiting the second-order perturbations in a three body system. The probe was launched into this region for the earth-moon-sun system, where the perturbations affected it in such a way as to allow it to go into lunar orbit without large expenditures of fuel to slow down. The idea is that \\'natural objects sometimes get captured without expending fuel, we\\'ll just find the trajectory that makes it possible\". The originator of the technique said that NASA wasn\\'t interested, but that Japan was because their probe was small and couldn\\'t hold a lot of fuel for deceleration. This from an issue of \\'Science News\\' or \\'The Planetary Report\\' I believe, about 2 months ago(?). ',\n", + " 'Archive-name: space/probe Last-modified: $Date: 93/04/01 14:39:19 $ PLANETARY PROBES - HISTORICAL MISSIONS This section was lightly adapted from an original posting by Larry Klaes (klaes@verga.enet.dec.com), mostly minor formatting changes. Matthew Wiener (weemba@libra.wistar.upenn.edu) contributed the section on Voyager, and the section on Sakigake was obtained from ISAS material posted by Yoshiro Yamada (yamada@yscvax.ysc.go.jp). 
US PLANETARY MISSIONS MARINER (VENUS, MARS, & MERCURY FLYBYS AND ORBITERS) MARINER 1, the first U.S. attempt to send a spacecraft to Venus, failed minutes after launch in 1962. The guidance instructions from the ground stopped reaching the rocket due to a problem with its antenna, so the onboard computer took control. However, there turned out to be a bug in the guidance software, and the rocket promptly went off course, so the Range Safety Officer destroyed it. Although the bug is sometimes claimed to have been an incorrect FORTRAN DO statement, it was actually a transcription error in which the bar (indicating smoothing) was omitted from the expression \"R-dot-bar sub n\" (nth smoothed value of derivative of radius). This error led the software to treat normal minor variations of velocity as if they were serious, leading to incorrect compensation. MARINER 2 became the first successful probe to flyby Venus in December of 1962, and it returned information which confirmed that Venus is a very hot (800 degrees Fahrenheit, now revised to 900 degrees F.) world with a cloud-covered atmosphere composed primarily of carbon dioxide (sulfuric acid was later confirmed in 1978). MARINER 3, launched on November 5, 1964, was lost when its protective shroud failed to eject as the craft was placed into interplanetary space. Unable to collect the Sun\\'s energy for power from its solar panels, the probe soon died when its batteries ran out and is now in solar orbit. It was intended for a Mars flyby with MARINER 4. MARINER 4, the sister probe to MARINER 3, did reach Mars in 1965 and took the first close-up images of the Martian surface (22 in all) as it flew by the planet. The probe found a cratered world with an atmosphere much thinner than previously thought. Many scientists concluded from this preliminary scan that Mars was a \"dead\" world in both the geological and biological sense. MARINER 5 was sent to Venus in 1967. 
It reconfirmed the data on that planet collected five years earlier by MARINER 2, plus the information that Venus\\' atmospheric pressure at its surface is at least 90 times that of Earth\\'s, or the equivalent of being 3,300 feet under the surface of an ocean. MARINER 6 and 7 were sent to Mars in 1969 and expanded upon the work done by MARINER 4 four years earlier. However, they failed to take away the concept of Mars as a \"dead\" planet, first made from the basic measurements of MARINER 4. MARINER 8 ended up in the Atlantic Ocean in 1971 when the rocket launcher autopilot failed. MARINER 9, the sister probe to MARINER 8, became the first craft to orbit Mars in 1971. It returned information on the Red Planet that no other probe had done before, revealing huge volcanoes on the Martian surface, as well as giant canyon systems, and evidence that water once flowed across the planet. The probe also took the first detailed closeup images of Mars\\' two small moons, Phobos and Deimos. MARINER 10 used Venus as a gravity assist to Mercury in 1974. The probe did return the first close-up images of the Venusian atmosphere in ultraviolet, revealing previously unseen details in the cloud cover, plus the fact that the entire cloud system circles the planet in four Earth days. MARINER 10 eventually made three flybys of Mercury from 1974 to 1975 before running out of attitude control gas. The probe revealed Mercury as a heavily cratered world with a mass much greater than thought. This would seem to indicate that Mercury has an iron core which makes up 75 percent of the entire planet. PIONEER (',\n", + " 'That is an idea.. The most efficient moon habitat.. also the idea of how to get the people off the moon once the prize was won.. Also the idea of how to rescue someone who is \"dying\" on the moon. Maybe have a area where they can all \"see\" each other, and can help each other if something happens.. 
I liek the idea of one prize for the first moon landing and return, by a non-governmental body.. Also the idea of then having a moon habitat race.. I know we need to do somthing to get people involved.. Eccentric millionaire/billionaire would be nice.. We see how old Ross feels about it.. After all it would be a great promotional thing and a way to show he does care about commericalization and the people.. Will try to broach the subject to him.. Moonbase on the cheap is a good idea.. NASA and friends seem to take to much time and give us to expensive stuff that of late does not work (hubble and such). Basically what is the difference between a $1mil peice of junk and a multi $1mil piece of junk.. I know junk..',\n", + " 'With the continuin talk about the \"End of the Space Age\" and complaints by government over the large cost, why not try something I read about that might just work. Announce that a reward of $1 billion would go to the first corporation who successfully keeps at least 1 person alive on the moon for a year. Then you\\'d see some of the inexpensive but not popular technologies begin to be developed. THere\\'d be a different kind of space race then! ',\n", + " \"an image of the moon has been caught in a weather satellite images of the earth. it appears in both the 0430-1500UT ir and visual images of the earth. the GIF images can be down loaded from vmd.cso.uiuc.edu and are named CI043015.GIF and CV043015.GIF for the IR and visual images respectively. pretty cool pictures; in the ir it's saturated but in the visual image details on the moon are viewable. the moon is not in the 1400UT images. \",\n", + " \" It may be that they just didn't mention it, or that they actually haven't thought about it. I got the vague impression from their mission proposal that they weren't taking a very holistic aproach to the whole thing. 
They seemed to want to land people on the Moon by the end of the decade without explaining why, or what they would do once they got there. The only application I remember from the Av Week article was placing a telescope on the Moon. That's great, but they don't explain why it can't be done robotically. But I'm a _member_. Besides Bill, I hang out with you :) \",\n", + " 'From the article \"What\\'s New\" Apr-16-93 in sci.physics.research: ........ WHAT\\'S NEW (in my opinion), Friday, 16 April 1993 Washington, DC 1. SPACE BILLBOARDS! IS THIS ONE THE \"SPINOFFS\" WE WERE PROMISED? In 1950, science fiction writer Robert Heinlein published \"The Man Who Sold the Moon,\" which involved a dispute over the sale of rights to the Moon for use as billboard. NASA has taken the firsteps toward this hideous vision of the future. Observers were startled this spring when a NASA launch vehicle arrived at the pad with \"SCHWARZENEGGER\" painted in huge block letters on the side of the booster rockets. Space Marketing Inc. had arranged for the ad to promote Arnold\\'s latest movie. Now, Space Marketing is working with University of Colorado and Livermore engineers on a plan to place a mile-long inflatable billboard in low-earth orbit. NASA would provide contractual launch services. However, since NASA bases its charge on seriously flawed cost estimates (WN 26 Mar 93) the taxpayers would bear most of the expense. This may look like environmental vandalism, but Mike Lawson, CEO of Space Marketing, told us yesterday that the real purpose of the project is to help the environment! The platform will carry ozone monitors he explained--advertising is just to help defray costs. .......... What do you think of this revolting and hideous attempt to vandalize the night sky? It is not even April 1 anymore. What about light pollution in observations? (I read somewhere else that it might even be visible during the day, leave alone at night). Is NASA really supporting this junk? 
Are protesting groups being organized in the States? Really, really depressed. Enzo',\n", + " 'I have often thought about, if its possible to have a powerfull laser on earth, to light at the Moon, and show lasergraphics at the surface so clearly that you can see it with your eyes when there is a new moon. How about a Coca Cola logo at the moon, easy way to target billions of people. Do you know if its possible? ',\n", + " 'At one time there was speculation that the first spacewalk (Alexei Leonov ?) was a staged fake. Has any evidence to support or contradict this claim emerged ? Was this claim perhaps another fevered Cold War hallucination ? ',\n", + " 'From the \"JPL Universe\" April 23, 1993 VLBI project meets with international space agencies',\n", + " ' Think for a moment about the technology required to do that. By the time they could make the Earth\\'s sky look like Las Vegas, the people could afford to go backpacking on the Moon. Round trip costs for 500 kg to the Moon would be about the same as 5000 kg in a Low Earth \"advertising\" orbit: Very roughly the same cost as a smallish billboard, therefore. If such ads were to become common place, that would have to be a very low price... The night sky on a Lunar backpacking trip would still be very pristine... There\\'s always been a problem of having to get away from civilization before you can really find \"natural\" scenery. 100 years ago, this usually didn\\'t take a trip of over 5 miles. Today, most people would have to go 100 miles or more. If we ever get to the point where we have billboards on orbit, that essentially means that no place on Earth is still \"wild.\" While that may or may not be a good thing, the orbital billboards aren\\'t the problem: They are just a symptom of growing, densely-populated civilization. 
Banning such ads will not save your view of the night sky, because by the time such ads could become widespread you will probably have trouble finding a place without street lights, where you can _see_ the stars... An ad on a moon of Jupiter would be rather pointless, since you need a telescope to see them. However, I\\'d love to see them get all the publicity they could from underwritting the \"Coca Cola Io Orbital Mapping Probe.\" They already can, to some extent: The IAU allows names derived from sponsors or patrons of scientific research. If Microscum donates money to a university astronomy program, one of the galactic astronomers could easily get a newly discovered galaxy named after them.',\n", + " ';From the article \"What\\'s New\" Apr-16-93 in sci.physics.research: ; ;........ ;WHAT\\'S NEW (in my opinion), Friday, 16 April 1993 Washington, DC ; ;1. SPACE BILLBOARDS! IS THIS ONE THE \"SPINOFFS\" WE WERE PROMISED? ;What about light pollution in observations? (I read somewhere else that ;it might even be visible during the day, leave alone at night). ;Is NASA really supporting this junk? ;Are protesting groups being organized in the States? ;Really, really depressed. ; ; Enzo I wouldn\\'t worry about it. There\\'s enough space debris up there that a mile-long inflatable would probably deflate in some very short period of time (less than a year) while cleaning up LEO somewhat. Sort of a giant fly-paper in orbit. Hmm, that could actually be useful. As for advertising -- sure, why not? A NASA friend and I spent one drunken night figuring out just exactly how much gold mylar we\\'d need to put the golden arches of a certain American fast food organization on the face of the Moon. 
Fortunately, we sobered up in the morning.'],\n", + " [258,\n", + " 535,\n", + " 357,\n", + " 321,\n", + " 102,\n", + " 165,\n", + " 458,\n", + " 323,\n", + " 50,\n", + " 344,\n", + " 545,\n", + " 571,\n", + " 640,\n", + " 532,\n", + " 143,\n", + " 171,\n", + " 312,\n", + " 427,\n", + " 134,\n", + " 141])" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Which information on the keyword 'moon landing' does topic 13 have?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Their Hiten engineering-test mission spent a while in a highly eccentric Earth orbit doing lunar flybys, and then was inserted into lunar orbit using some very tricky gravity-assist-like maneuvering. This meant that it would crash on the Moon eventually, since there is no such thing as a stable lunar orbit (as far as anyone knows), and I believe I recall hearing recently that it was about to happen.\n" + ] + } + ], + "source": [ + "print(tm.topic_lis[13].documents[102])" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"split_topic_kmeans\",\n", + " \"arguments\": \"{\\n \\\"topic_idx\\\": 6,\\n \\\"n_clusters\\\": 5\\n}\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:01<00:00, 1.30s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:03]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:04<00:00, 4.34s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 2.93it/s]\n", + 
"Epochs completed: 100%| ██████████ 100/100 [00:05]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:05<00:00, 5.68s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 4.42it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:03<00:00, 3.94s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 8.14it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:04<00:00, 4.51s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 14.76it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "100%|██████████| 1/1 [00:04<00:00, 4.09s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here are five potential subtopics of topic 6:\n", + "\n", + "1. Existence of God: This subtopic involves questioning the existence of God and examining the evidence for and against it.\n", + "\n", + "2. Sexual Orientation: This subtopic relates to homosexuality and encompasses aspects such as sexual orientation, rights and discrimination, social attitudes, relationships and partners, and public perception.\n", + "\n", + "3. Ethics and Morality: This subtopic focuses on moral and ethical principles, including moral philosophy, moral reasoning, moral standards, moral dilemmas, and moral relativism.\n", + "\n", + "4. Religion and Law: This subtopic explores the intersection of religion and law, including beliefs, practices, interpretation, controversies, and the role of religion in society and politics.\n", + "\n", + "5. 
Argumentation and Atheism: This subtopic revolves around debates and arguments, involving communication, logical reasoning, disagreements, the intersection of religion and atheism, and the criticism and analysis of arguments.\n", + "\n", + "Please note that these subtopics are inferred from the information retrieved from the topic modelling results.\n" + ] + }, + { + "data": { + "text/plain": [ + "[Topic 0: Electronics Equipment Sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Online Privacy and Anonymity,\n", + " Topic 4: Conflict and Violence.,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Online Discussions,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car Features and Performance,\n", + " Topic 9: Encryption and Government,\n", + " Topic 10: Technology and Computing.,\n", + " Topic 11: Technology and Computing,\n", + " Topic 12: Space Exploration,\n", + " Topic 13: Motorcycle Riding Techniques,\n", + " Topic 14: Technology,\n", + " Topic 15: Hockey Games,\n", + " Topic 16: Health and Medicine.,\n", + " Topic 17: Baseball games and teams.,\n", + " Topic 18: Beliefs about Homosexuality.,\n", + " Topic 19: Existence of God,\n", + " Topic 20: Sexual Orientation,\n", + " Topic 21: Ethics and Morality,\n", + " Topic 22: Religion and Law,\n", + " Topic 23: Argumentation and Atheism.]" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"What are 5 potential subtopics of topic 6\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Topic splitting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the previously identified topics, we decide to split topic 6 not into 5 but into three subtopics based on the keywords 'religious faith', 'atheism' and 'ethics and philosophy'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"split_topic_keywords\",\n", + " \"arguments\": \"{\\n \\\"topic_idx\\\": 6,\\n \\\"keywords\\\": [\\\"religious faith\\\", \\\"atheism\\\", \\\"ethics and philosophy\\\"],\\n \\\"inplace\\\": true\\n}\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 12.43it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:04<00:00, 4.88s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 1.42it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:03]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:06<00:00, 6.45s/it]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:00]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:00<00:00, 2.04it/s]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:02]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "100%|██████████| 1/1 [00:04<00:00, 4.19s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Topic 0: Electronics Equipment Sales\n", + ", Topic 1: Image Processing\n", + ", Topic 2: Gun control\n", + ", Topic 3: Online Privacy and Anonymity\n", + ", Topic 4: Conflict and Violence.\n", + ", Topic 5: Computer Hardware\n", + ", Topic 6: Online Discussions\n", + ", Topic 7: Computer Software\n", + ", Topic 8: Car Features and Performance\n", + ", Topic 9: Encryption and Government\n", + ", Topic 10: Technology and Computing.\n", + ", Topic 11: Technology and Computing\n", + ", Topic 
12: Space Exploration\n", + ", Topic 13: Motorcycle Riding Techniques\n", + ", Topic 14: Technology\n", + ", Topic 15: Hockey Games\n", + ", Topic 16: Health and Medicine.\n", + ", Topic 17: Baseball games and teams.\n", + ", Topic 18: Beliefs about Homosexuality.\n", + ", Topic 19: Religious Beliefs\n", + ", Topic 20: Existence of God\n", + ", Topic 21: Ethics and Morality\n", + "]\n", + "Topic 6 has been split into the following subtopics based on the keywords 'religious faith', 'atheism', and 'ethics and philosophy':\n", + "\n", + "1. Subtopic: Religious Beliefs\n", + " - Description: The common topic of these words is \"Religion and Beliefs\". Aspects and sub-topics of this topic include faith and belief, religious practices, interpretation and understanding, controversies and disagreements, and the impact on society.\n", + " - Topic index: 19\n", + "\n", + "2. Subtopic: Existence of God\n", + " - Description: The common topic of the provided words is \"Atheism and Belief\". Aspects and sub-topics of this topic include atheist beliefs, atheist arguments, atheist perspective, atheist skepticism, and atheist criticism.\n", + " - Topic index: 20\n", + "\n", + "3. Subtopic: Ethics and Morality\n", + " - Description: The common topic of the given words appears to be \"Morality and Ethics\". 
Various aspects and sub-topics related to this topic include moral reasoning and arguments, ethical dilemmas and moral judgments, objective vs subjective morality, homosexuality and sexual ethics, and science and morality.\n", + " - Topic index: 21\n", + "\n", + "Please note that these subtopics have been created based on the provided keywords and may not capture all aspects related to the keywords.\n" + ] + }, + { + "data": { + "text/plain": [ + "[Topic 0: Electronics Equipment Sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Online Privacy and Anonymity,\n", + " Topic 4: Conflict and Violence.,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Online Discussions,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car Features and Performance,\n", + " Topic 9: Encryption and Government,\n", + " Topic 10: Technology and Computing.,\n", + " Topic 11: Technology and Computing,\n", + " Topic 12: Space Exploration,\n", + " Topic 13: Motorcycle Riding Techniques,\n", + " Topic 14: Technology,\n", + " Topic 15: Hockey Games,\n", + " Topic 16: Health and Medicine.,\n", + " Topic 17: Baseball games and teams.,\n", + " Topic 18: Beliefs about Homosexuality.,\n", + " Topic 19: Religious Beliefs,\n", + " Topic 20: Existence of God,\n", + " Topic 21: Ethics and Morality]" + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Please split topic 6 into subtopics based on the keywords 'religious faith', 'atheism' and 'ethics and philosophy'. 
Do this inplace.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Topic 0: Electronics Equipment Sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Online Privacy and Anonymity,\n", + " Topic 4: Conflict and Violence.,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Online Discussions,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car Features and Performance,\n", + " Topic 9: Encryption and Government,\n", + " Topic 10: Technology and Computing.,\n", + " Topic 11: Technology and Computing,\n", + " Topic 12: Space Exploration,\n", + " Topic 13: Motorcycle Riding Techniques,\n", + " Topic 14: Technology,\n", + " Topic 15: Hockey Games,\n", + " Topic 16: Health and Medicine.,\n", + " Topic 17: Baseball games and teams.,\n", + " Topic 18: Beliefs about Homosexuality.,\n", + " Topic 19: Religious Beliefs,\n", + " Topic 20: Existence of God,\n", + " Topic 21: Ethics and Morality]" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.topic_lis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combine Topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Topics 15 and 17 both seem to be about sports, so let's merge them into one topic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"combine_topics\",\n", + " \"arguments\": \"{\\n \\\"topic_idx_lis\\\": [15, 17],\\n \\\"inplace\\\": true\\n}\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Computing word-topic matrix: 100%|██████████| 1/1 [00:07<00:00, 7.16s/it]\n", + "Epochs completed: 100%| ██████████ 30/30 [00:09]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:02]\n", + "100%|██████████| 1/1 [00:06<00:00, 6.62s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The topics 15 and 17 have been combined into a new topic called \"Sports\". This topic includes aspects and sub-topics related to sports such as team and players, games and seasons, performance and skills, fans and audience, and statistics and records. 
Some of the common words found in this topic include \"team,\" \"players,\" \"hockey,\" \"baseball,\" \"game,\" \"games,\" \"season,\" \"playoffs,\" \"good,\" \"better,\" \"win,\" \"hit,\" \"score,\" \"fans,\" \"series,\" \"watch,\" \"fan,\" \"stats,\" \"record,\" \"pts,\" and \"career\".\n" + ] + }, + { + "data": { + "text/plain": [ + "[Topic 0: Electronics Equipment Sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Online Privacy and Anonymity,\n", + " Topic 4: Conflict and Violence.,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Online Discussions,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car Features and Performance,\n", + " Topic 9: Encryption and Government,\n", + " Topic 10: Technology and Computing.,\n", + " Topic 11: Technology and Computing,\n", + " Topic 12: Space Exploration,\n", + " Topic 13: Motorcycle Riding Techniques,\n", + " Topic 14: Technology,\n", + " Topic 15: Health and Medicine.,\n", + " Topic 16: Beliefs about Homosexuality.,\n", + " Topic 17: Religious Beliefs,\n", + " Topic 18: Existence of God,\n", + " Topic 19: Ethics and Morality,\n", + " Topic 20: Sports]" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Please combine topics 15 and 17. Do this inplace.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since topics 10 and 11 have the same title, we can delete one of them. Note that this doesn't delete the documents from the deleted topic, but rather distributes them over the other topics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"delete_topic\",\n", + " \"arguments\": \"{\\n \\\"topic_idx\\\": 10,\\n \\\"inplace\\\": true\\n}\"\n", + "}\n", + "Tue Sep 5 13:50:48 2023 Building and compiling search function\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:02]\n", + "Computing word-topic matrix: 100%|██████████| 20/20 [02:01<00:00, 6.06s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of tfidf: (31365, 20)\n", + "shape fo word_topic_mat: (31365, 20)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 30/30 [00:05]\n", + "100%|██████████| 20/20 [01:43<00:00, 5.17s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The topic with index 10 has been successfully deleted. \n", + "\n", + "After removing topic 10, the new topic we have is topic with index 19, which is related to \"Sports\". The various aspects and sub-topics of this topic include:\n", + "\n", + "1. Games: \"game\", \"games\", \"play\", \"team\", \"players\"\n", + "2. Seasons: \"year\", \"season\", \"last\", \"years\", \"playoffs\"\n", + "3. Performance: \"good\", \"better\", \"well\", \"great\", \"average\"\n", + "4. Strategies: \"think\", \"strategy\", \"tactics\", \"coach\", \"plan\"\n", + "5. 
Results: \"win\", \"score\", \"goal\", \"points\", \"victory\"\n" + ] + }, + { + "data": { + "text/plain": [ + "[Topic 0: Electronics equipment sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Online Privacy,\n", + " Topic 4: Conflict and violence.,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Anonymity in online discussions.,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car Features and Performance,\n", + " Topic 9: Encryption,\n", + " Topic 10: Technology and Computing,\n", + " Topic 11: Space Exploration,\n", + " Topic 12: Motorcycle Riding Tips,\n", + " Topic 13: Technology and Computing,\n", + " Topic 14: Healthcare and Medicine.,\n", + " Topic 15: Biblical interpretation,\n", + " Topic 16: Religious Beliefs,\n", + " Topic 17: Existence of God,\n", + " Topic 18: Morality in Health Insurance,\n", + " Topic 19: Sports]" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Please delete topic 10. 
Do this inplace.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare Topics " + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\arik_\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"get_topic_information\",\n", + " \"arguments\": \"{\\n \\\"topic_idx_lis\\\": [5, 7]\\n}\"\n", + "}\n", + "Topic 5 is about computer hardware, while topic 7 is about computer software.\n", + "\n", + "In topic 5, the common topic is \"computer hardware,\" and it covers various aspects and sub-topics such as storage, components, performance, connectivity, and display. Some of the top words in this topic include \"drive,\" \"card,\" \"disk,\" \"memory,\" and \"video.\" It seems to be discussing various hardware-related issues, including problems, compatibility, performance, and configuration. It also mentions specific components like hard drives, cards, motherboards, and monitors. Overall, this topic focuses on the physical components of a computer system.\n", + "\n", + "On the other hand, topic 7 is about computer software and usage. It covers aspects such as operating systems, software programs, computer hardware, user interface, and troubleshooting. Some of the top words in this topic include \"file,\" \"program,\" \"windows,\" \"software,\" and \"run.\" It seems to discuss different software-related topics, including file management, application installation, error messages, and system performance. 
It also mentions specific software programs like file utilities and font management. This topic focuses on the software aspect of using a computer system.\n", + "\n", + "In summary, topic 5 focuses on computer hardware, while topic 7 focuses on computer software. They cover different aspects but are both related to the functioning and usage of a computer system.\n" + ] + }, + { + "data": { + "text/plain": [ + "{5: '\\n Topic index: 5\\n Topic name: Computer Hardware\\n Topic description: The common topic of the given words is \"computer hardware\". \\n\\nThe various aspects and sub-topics of this topic include:\\n1. Storage: disk, hard drive, floppy, drives, disks.\\n2. Components: card, controller, board, chip, motherboard.\\n3. Performance: speed, memory, clock, mhz, faster.\\n4. Connectivity: bus, port, connector, cable, serial.\\n5. Display: monitor, video, screen, color, graphics.\\n Topic topwords: [\"n\\'t\", \\'drive\\', \\'card\\', \\'one\\', \\'would\\', \\'use\\', \\'know\\', \\'get\\', \\'like\\', \\'disk\\', \\'system\\', \\'problem\\', \\'drives\\', \\'work\\', \\'also\\', \\'controller\\', \\'hard\\', \\'anyone\\', \\'using\\', \\'drivers\\', \\'need\\', \\'two\\', \\'monitor\\', \\'bus\\', \\'new\\', \\'used\\', \\'software\\', \\'speed\\', \\'data\\', \\'could\\', \\'think\\', \\'driver\\', \\'memory\\', \\'board\\', \\'time\\', \\'problems\\', \\'mode\\', \\'video\\', \\'port\\', \\'good\\', \\'much\\', \\'cards\\', \\'computer\\', \\'machine\\', \\'run\\', \\'modem\\', \\'want\\', \\'may\\', \\'chip\\', \\'got\\', \\'hardware\\', \\'floppy\\', \\'help\\', \\'support\\', \\'really\\', \\'even\\', \\'set\\', \\'motherboard\\', \\'power\\', \\'well\\', \\'price\\', \\'make\\', \\'cable\\', \\'works\\', \\'way\\', \\'tape\\', \\'better\\', \\'program\\', \\'faster\\', \\'back\\', \\'etc\\', \\'people\\', \\'interface\\', \\'standard\\', \\'see\\', \\'still\\', \\'since\\', \\'version\\', \\'available\\', \\'performance\\', \\'running\\', \\'buy\\', 
\\'printer\\', \\'read\\', \\'sure\\', \\'bit\\', \\'serial\\', \\'able\\', \\'internal\\', \\'machines\\', \\'different\\', \\'fast\\', \\'information\\', \\'something\\', \\'might\\', \\'right\\', \\'info\\', \\'question\\', \\'first\\', \\'screen\\', \\'fine\\', \\'another\\', \\'color\\', \\'many\\', \\'number\\', \\'please\\', \\'advance\\', \\'find\\', \\'getting\\', \\'though\\', \\'thing\\', \\'clock\\', \\'seems\\', \\'going\\', \\'seen\\', \\'looking\\', \\'mhz\\', \\'tried\\', \\'someone\\', \\'connector\\', \\'possible\\', \\'tell\\', \\'without\\', \\'anything\\', \\'uses\\', \\'transfer\\', \\'old\\', \\'bought\\', \\'must\\', \\'heard\\', \\'external\\', \\'either\\', \\'devices\\', \\'appreciated\\', \\'try\\', \\'pin\\', \\'sound\\', \\'second\\', \\'around\\', \\'come\\', \\'cache\\', \\'put\\', \\'least\\', \\'chips\\', \\'upgrade\\', \\'probably\\', \\'disks\\', \\'following\\', \\'high\\', \\'send\\', \\'rate\\', \\'local\\', \\'supports\\', \\'done\\', \\'called\\', \\'say\\', \\'installed\\', \\'monitors\\', \\'access\\', \\'bad\\', \\'type\\', \\'ports\\', \\'line\\', \\'windows\\', \\'graphics\\', \\'cost\\', \\'end\\', \\'last\\', \\'email\\', \\'switch\\', \\'lot\\', \\'little\\', \\'anybody\\', \\'adapter\\', \\'mail\\', \\'found\\', \\'give\\', \\'systems\\', \\'take\\', \\'boot\\', \\'device\\', \\'quality\\', \\'kind\\', \\'things\\', \\'meg\\', \\'file\\', \\'trying\\', \\'never\\', \\'boards\\', \\'order\\', \\'case\\', \\'look\\', \\'original\\', \\'comes\\', \\'display\\', \\'keyboard\\', \\'said\\', \\'mouse\\', \\'address\\', \\'best\\', \\'jumper\\', \\'recently\\', \\'via\\', \\'far\\', \\'post\\', \\'long\\', \\'seem\\', \\'believe\\', \\'pins\\', \\'change\\', \\'experience\\', \\'difference\\', \\'mac\\', \\'allow\\', \\'based\\', \\'add\\', \\'interested\\', \\'processor\\', \\'printers\\', \\'connect\\', \\'modes\\', \\'says\\', \\'times\\', \\'else\\', \\'control\\', \\'less\\', \\'pretty\\', \\'several\\', 
\\'working\\', \\'e-mail\\', \\'true\\', \\'extra\\', \\'thanks\\', \\'write\\', \\'jumpers\\', \\'ago\\', \\'full\\', \\'real\\', \\'quite\\', \\'low\\', \\'actually\\', \\'results\\', \\'let\\', \\'phone\\', \\'list\\', \\'check\\', \\'every\\', \\'means\\', \\'supply\\', \\'programs\\', \\'thought\\', \\'lines\\', \\'colors\\', \\'speeds\\', \\'large\\', \\'slow\\', \\'others\\', \\'higher\\', \\'backup\\', \\'heads\\', \\'told\\', \\'directly\\', \\'due\\', \\'three\\', \\'stuff\\', \\'print\\', \\'place\\', \\'per\\', \\'needs\\', \\'slave\\', \\'great\\', \\'unless\\', \\'buying\\', \\'already\\', \\'format\\', \\'prices\\', \\'slot\\', \\'built-in\\', \\'interrupt\\', \\'model\\', \\'part\\', \\'nice\\', \\'compatible\\', \\'everything\\', \\'yet\\', \\'answer\\', \\'size\\', \\'made\\', \\'point\\', \\'enough\\', \\'months\\', \\'except\\', \\'hear\\', \\'configuration\\', \\'note\\', \\'laser\\', \\'reason\\', \\'horizontal\\', \\'came\\', \\'numbers\\', \\'side\\', \\'makes\\', \\'setup\\', \\'however\\', \\'resolution\\', \\'error\\', \\'similar\\', \\'correct\\', \\'course\\', \\'couple\\', \\'single\\', \\'questions\\', \\'article\\', \\'guess\\', \\'types\\', \\'applications\\', \\'cables\\', \\'slower\\', \\'goes\\', \\'expensive\\', \\'ink\\', \\'usually\\', \\'start\\', \\'idea\\', \\'paper\\', \\'output\\', \\'solution\\', \\'wrong\\', \\'needed\\', \\'computers\\', \\'socket\\', \\'present\\', \\'allows\\', \\'plug\\', \\'runs\\', \\'maybe\\', \\'deal\\', \\'money\\', \\'settings\\', \\'open\\', \\'install\\', \\'year\\', \\'reply\\', \\'slots\\', \\'mean\\', \\'setting\\', \\'normal\\', \\'instead\\', \\'manual\\', \\'accelerator\\', \\'handle\\', \\'left\\', \\'nothing\\', \\'sell\\', \\'current\\', \\'files\\', \\'cheap\\', \\'service\\', \\'thinking\\', \\'greatly\\', \\'always\\', \\'market\\', \\'mine\\', \\'level\\', \\'errors\\', \\'ftp\\', \\'code\\', \\'purchased\\', \\'box\\', \\'user\\', \\'cheaper\\', \\'often\\', \\'requires\\', 
\\'include\\', \\'vertical\\', \\'optional\\', \\'burst\\', \\'wondering\\', \\'comments\\', \\'master\\', \\'worth\\', \\'main\\', \\'purchase\\', \\'feature\\', \\'years\\', \\'asked\\', \\'currently\\', \\'whether\\', \\'test\\', \\'wide\\', \\'day\\', \\'call\\', \\'example\\', \\'friend\\', \\'ones\\', \\'soon\\', \\'plus\\', \\'copy\\', \\'clear\\', \\'keep\\', \\'formatting\\', \\'free\\', \\'image\\', \\'cause\\', \\'dealer\\', \\'big\\', \\'gives\\', \\'small\\', \\'includes\\', \\'although\\', \\'maximum\\', \\'controllers\\', \\'products\\', \\'reading\\', \\'option\\', \\'provide\\', \\'inches\\', \\'almost\\', \\'rates\\', \\'built\\', \\'company\\', \\'ever\\', \\'common\\', \\'together\\', \\'name\\', \\'properly\\', \\'week\\', \\'cylinders\\', \\'talking\\', \\'section\\', \\'asynchronous\\', \\'latest\\', \\'advice\\', \\'product\\', \\'connected\\', \\'compared\\', \\'mentioned\\', \\'wait\\', \\'message\\', \\'weeks\\', \\'trouble\\', \\'newer\\', \\'third\\', \\'special\\', \\'older\\', \\'looks\\', \\'included\\', \\'hope\\', \\'signal\\', \\'features\\', \\'next\\', \\'ask\\', \\'appreciate\\', \\'transfers\\', \\'details\\', \\'bits\\', \\'bytes\\', \\'rather\\', \\'including\\', \\'fact\\', \\'parity\\', \\'hook\\', \\'general\\', \\'expansion\\', \\'require\\', \\'especially\\', \\'gets\\', \\'coprocessor\\', \\'home\\', \\'net\\', \\'operational\\', \\'sometimes\\', \\'future\\', \\'parallel\\', \\'sent\\', \\'figure\\', \\'buffer\\', \\'versions\\', \\'lower\\', \\'ideas\\', \\'top\\', \\'wires\\', \\'range\\', \\'later\\', \\'package\\', \\'went\\', \\'ram\\', \\'exactly\\', \\'share\\', \\'issue\\', \\'four\\', \\'chipset\\', \\'printing\\', \\'group\\', \\'result\\', \\'supported\\', \\'choice\\', \\'tower\\', \\'happens\\', \\'written\\']',\n", + " 7: '\\n Topic index: 7\\n Topic name: Computer Software\\n Topic description: The common topic of the given words is \"computer software and usage\". 
\\n\\nVarious aspects and sub-topics of this topic include:\\n1. Operating systems: Windows, DOS, virtual machines.\\n2. Software programs: file management, disk utilities, font management, application installation.\\n3. Computer hardware: memory allocation, graphics cards, printers.\\n4. User interface: command line, window display, mouse and keyboard.\\n5. Troubleshooting: error messages, software crashes, system performance.\\n Topic topwords: [\"n\\'t\", \\'file\\', \\'use\\', \\'program\\', \\'windows\\', \\'would\\', \\'one\\', \\'get\\', \\'like\\', \\'disk\\', \\'files\\', \\'know\\', \\'run\\', \\'using\\', \\'fonts\\', \\'time\\', \\'software\\', \\'think\\', \\'system\\', \\'problem\\', \\'also\\', \\'people\\', \\'copy\\', \\'computer\\', \\'help\\', \\'version\\', \\'good\\', \\'font\\', \\'running\\', \\'way\\', \\'make\\', \\'anyone\\', \\'work\\', \\'memory\\', \\'need\\', \\'much\\', \\'support\\', \\'new\\', \\'programs\\', \\'users\\', \\'even\\', \\'available\\', \\'drive\\', \\'set\\', \\'user\\', \\'want\\', \\'could\\', \\'used\\', \\'something\\', \\'etc\\', \\'see\\', \\'information\\', \\'better\\', \\'driver\\', \\'really\\', \\'higher\\', \\'may\\', \\'many\\', \\'hard\\', \\'find\\', \\'without\\', \\'machine\\', \\'apps\\', \\'well\\', \\'applications\\', \\'still\\', \\'two\\', \\'since\\', \\'protection\\', \\'data\\', \\'info\\', \\'got\\', \\'directory\\', \\'application\\', \\'keyboard\\', \\'swap\\', \\'thing\\', \\'allocation\\', \\'unit\\', \\'problems\\', \\'first\\', \\'change\\', \\'printer\\', \\'read\\', \\'say\\', \\'access\\', \\'mode\\', \\'screen\\', \\'number\\', \\'able\\', \\'linked\\', \\'mouse\\', \\'installed\\', \\'cross\\', \\'product\\', \\'things\\', \\'sure\\', \\'code\\', \\'long\\', \\'lot\\', \\'space\\', \\'hacker\\', \\'ftp\\', \\'going\\', \\'around\\', \\'take\\', \\'try\\', \\'install\\', \\'either\\', \\'point\\', \\'seen\\', \\'different\\', \\'least\\', \\'course\\', \\'though\\', \\'comes\\', 
\\'right\\', \\'else\\', \\'never\\', \\'full\\', \\'character\\', \\'tell\\', \\'probably\\', \\'seems\\', \\'command\\', \\'window\\', \\'look\\', \\'anything\\', \\'card\\', \\'found\\', \\'copies\\', \\'text\\', \\'compatible\\', \\'manager\\', \\'last\\', \\'every\\', \\'server\\', \\'runs\\', \\'part\\', \\'stuff\\', \\'box\\', \\'free\\', \\'upgrade\\', \\'create\\', \\'type\\', \\'question\\', \\'graphics\\', \\'products\\', \\'enough\\', \\'size\\', \\'original\\', \\'mail\\', \\'error\\', \\'floppy\\', \\'drivers\\', \\'bit\\', \\'another\\', \\'disks\\', \\'play\\', \\'give\\', \\'systems\\', \\'name\\', \\'yet\\', \\'easy\\', \\'mean\\', \\'recommended\\', \\'line\\', \\'little\\', \\'characters\\', \\'load\\', \\'someone\\', \\'message\\', \\'utility\\', \\'database\\', \\'large\\', \\'back\\', \\'sound\\', \\'standard\\', \\'done\\', \\'must\\', \\'working\\', \\'actually\\', \\'icon\\', \\'believe\\', \\'hardware\\', \\'came\\', \\'fine\\', \\'virtual\\', \\'machines\\', \\'possible\\', \\'key\\', \\'network\\', \\'might\\', \\'real\\', \\'dos\\', \\'best\\', \\'write\\', \\'typing\\', \\'come\\', \\'works\\', \\'called\\', \\'quite\\', \\'bad\\', \\'true\\', \\'environment\\', \\'pretty\\', \\'setup\\', \\'color\\', \\'wrong\\', \\'needed\\', \\'gets\\', \\'keep\\', \\'please\\', \\'later\\', \\'compressed\\', \\'major\\', \\'far\\', \\'print\\', \\'versions\\', \\'getting\\', \\'looking\\', \\'included\\', \\'email\\', \\'means\\', \\'package\\', \\'less\\', \\'always\\', \\'uses\\', \\'idea\\', \\'installation\\', \\'display\\', \\'next\\', \\'based\\', \\'said\\', \\'reading\\', \\'compiler\\', \\'post\\', \\'makes\\', \\'including\\', \\'article\\', \\'temp\\', \\'marketing\\', \\'trying\\', \\'old\\', \\'features\\', \\'world\\', \\'address\\', \\'made\\', \\'seem\\', \\'year\\', \\'example\\', \\'already\\', \\'faster\\', \\'start\\', \\'fact\\', \\'sort\\', \\'group\\', \\'times\\', \\'heard\\', \\'break\\', \\'provides\\', \\'says\\', 
\\'company\\', \\'often\\', \\'couple\\', \\'amount\\', \\'utilities\\', \\'everything\\', \\'person\\', \\'button\\', \\'device\\', \\'within\\', \\'remember\\', \\'money\\', \\'includes\\', \\'however\\', \\'put\\', \\'manual\\', \\'send\\', \\'great\\', \\'tried\\', \\'stop\\', \\'similar\\', \\'cview\\', \\'port\\', \\'crash\\', \\'maybe\\', \\'second\\', \\'via\\', \\'shell\\', \\'sales\\', \\'operating\\', \\'rather\\', \\'exercises\\', \\'several\\', \\'megs\\', \\'general\\', \\'normal\\', \\'years\\', \\'word\\', \\'writing\\', \\'results\\', \\'let\\', \\'hackers\\', \\'programming\\', \\'written\\', \\'allow\\', \\'game\\', \\'open\\', \\'goes\\', \\'bytes\\', \\'call\\', \\'various\\', \\'others\\', \\'tools\\', \\'small\\', \\'check\\', \\'guess\\', \\'supported\\', \\'clients\\', \\'takes\\', \\'difference\\', \\'performance\\', \\'wait\\', \\'correct\\', \\'posted\\', \\'library\\', \\'registration\\', \\'schemes\\', \\'order\\', \\'case\\', \\'interface\\', \\'anyway\\', \\'names\\', \\'legitimate\\', \\'following\\', \\'supposed\\', \\'inferior\\', \\'instance\\', \\'backup\\', \\'whatever\\', \\'document\\', \\'told\\', \\'simply\\', \\'local\\', \\'compression\\', \\'batch\\', \\'matter\\', \\'price\\', \\'instead\\', \\'pirates\\', \\'control\\', \\'resolution\\', \\'hand\\', \\'top\\', \\'certain\\', \\'wondering\\', \\'config.sys\\', \\'computers\\', \\'started\\', \\'unless\\', \\'value\\', \\'day\\', \\'useful\\', \\'choose\\', \\'companies\\', \\'section\\', \\'big\\', \\'ever\\', \\'ones\\', \\'worth\\', \\'slow\\', \\'expect\\', \\'required\\', \\'end\\', \\'enhanced\\', \\'answer\\', \\'thought\\', \\'client\\', \\'format\\', \\'kind\\', \\'high\\', \\'ago\\', \\'needs\\', \\'certainly\\', \\'almost\\', \\'opinions\\', \\'reason\\', \\'worked\\', \\'language\\', \\'e-mail\\', \\'programmers\\', \\'appreciated\\', \\'archive\\', \\'completely\\', \\'interesting\\', \\'cluster\\', \\'project\\', \\'option\\', \\'future\\', \\'nice\\', 
\\'usually\\', \\'important\\', \\'groups\\', \\'patch\\', \\'recently\\', \\'special\\', \\'multiple\\', \\'keys\\', \\'provide\\', \\'stable\\', \\'delete\\', \\'single\\', \\'list\\', \\'fixed\\', \\'commercial\\', \\'consider\\', \\'learn\\', \\'entry\\', \\'experience\\', \\'multitasking\\', \\'programmer\\', \\'laser\\', \\'computing\\', \\'workstation\\', \\'databases\\', \\'libraries\\', \\'reboot\\', \\'anybody\\', \\'board\\', \\'desktop\\', \\'midi\\', \\'left\\', \\'click\\', \\'larger\\', \\'add\\', \\'exactly\\', \\'packet\\', \\'path\\', \\'advance\\', \\'fix\\', \\'cache\\', \\'numbers\\', \\'site\\', \\'place\\', \\'actual\\', \\'main\\', \\'nothing\\', \\'buy\\', \\'development\\', \\'gives\\', \\'reply\\', \\'printing\\', \\'questions\\', \\'looks\\', \\'easier\\', \\'registered\\', \\'include\\', \\'starting\\', \\'agree\\', \\'current\\', \\'prevent\\', \\'properly\\', \\'decided\\', \\'job\\', \\'business\\', \\'opinion\\', \\'startup\\', \\'servers\\', \\'injuries\\', \\'i.e\\', \\'offer\\', \\'mentioned\\', \\'whether\\', \\'discussion\\', \\'ask\\', \\'service\\', \\'newsgroup\\', \\'soda.berkeley.edu\\', \\'knows\\', \\'release\\', \\'floppies\\', \\'monochrome\\', \\'slower\\', \\'industry\\', \\'update\\', \\'lots\\', \\'due\\', \\'cause\\', \\'documentation\\', \\'app\\', \\'view\\', \\'appears\\', \\'rest\\', \\'power\\', \\'changed\\', \\'interested\\', \\'dont\\', \\'requires\\', \\'breaks\\']'}" + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Please compare topics 5 and 7.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add a completely new topic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also add a completely new topic based on the keyword \"Politics and government\"" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "GPT wants to the call the function: {\n", + " \"name\": \"add_new_topic_keyword\",\n", + " \"arguments\": \"{\\n \\\"keyword\\\": \\\"Politics and the government\\\"\\n}\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epochs completed: 100%| ██████████ 100/100 [00:01]\n", + "Epochs completed: 100%| ██████████ 100/100 [00:03]\n", + "Computing word-topic matrix: 100%|██████████| 21/21 [01:18<00:00, 3.72s/it]\n", + "Epochs completed: 100%| ██████████ 30/30 [00:04]\n", + "100%|██████████| 21/21 [01:58<00:00, 5.64s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sure! I have added a new topic based on the keyword \"Politics and the government\". The new topic is called \"Government and Policy\".\n", + "\n", + "The common theme of the given words is \"government and public policy\". This topic encompasses various aspects and sub-topics, including:\n", + "\n", + "1. Government actions and decisions: This sub-topic focuses on the actions and decisions made by the government. It includes words like \"make\", \"said\", \"believe\", \"question\", and \"decisions\".\n", + "\n", + "2. Healthcare system: This sub-topic is centered around the healthcare system, including topics such as health, care, insurance, private healthcare, and drugs.\n", + "\n", + "3. Economy and jobs: This sub-topic explores the economy and job-related issues. It includes words like jobs, work, economic, business, and spending.\n", + "\n", + "4. Education and schools: This sub-topic pertains to education and schools. It includes words like education, school, students, learning, and teachers.\n", + "\n", + "5. Law and order: This sub-topic focuses on the legal system and maintaining law and order. 
It includes words like law, crime, police, legal, and punishment.\n", + "\n", + "These are just a few examples of the aspects and sub-topics that fall under the broader topic of \"Government and Policy\".\n" + ] + }, + { + "data": { + "text/plain": [ + "[Topic 0: Electronics equipment sales,\n", + " Topic 1: Image Processing,\n", + " Topic 2: Gun control,\n", + " Topic 3: Internet Privacy,\n", + " Topic 4: Conflict and Violence,\n", + " Topic 5: Computer Hardware,\n", + " Topic 6: Anonymous Posting,\n", + " Topic 7: Computer Software,\n", + " Topic 8: Car features and performance.,\n", + " Topic 9: Encryption,\n", + " Topic 10: Technology,\n", + " Topic 11: Space Exploration,\n", + " Topic 12: Motorcycle Riding Tips,\n", + " Topic 13: Technology and Computing,\n", + " Topic 14: Healthcare and Medicine,\n", + " Topic 15: Biblical interpretation,\n", + " Topic 16: Beliefs and Religion,\n", + " Topic 17: Existence of God,\n", + " Topic 18: Sexual Morality,\n", + " Topic 19: Sports,\n", + " Topic 20: Government and Policy]" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm.pprompt(\"Please add a completely new topic based on the keyword 'Politics and the government'.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "test_llm_sem1", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/LLMTopicDetection_TopicGPT/requirements.txt b/LLMTopicDetection_TopicGPT/requirements.txt new file mode 100644 index 0000000..b3b4b5c --- /dev/null +++ b/LLMTopicDetection_TopicGPT/requirements.txt @@ -0,0 +1,16 @@ +gensim +hdbscan +nltk +numpy +openai >= 1.0.0 +pandas +plotly 
+regex +scikit-learn +seaborn +sentence-transformers +tiktoken +tokenizers +tqdm +umap-learn +umap-learn[plot] diff --git a/LLMTopicDetection_TopicGPT/setup.py b/LLMTopicDetection_TopicGPT/setup.py new file mode 100644 index 0000000..0bbb235 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/setup.py @@ -0,0 +1,49 @@ +from setuptools import setup, find_packages + + +with open("README.md", 'r', encoding='utf') as f: + long_description = f.read() + +setup( + name='topicgpt', + version='0.0.5', + packages=find_packages(where='src'), + package_dir={'': 'src'}, + install_requires=[ + 'gensim', + 'hdbscan', + 'nltk', + 'numpy', + 'openai>=1.0.0', + 'pandas', + 'plotly', + 'regex', + 'scikit-learn', + 'seaborn', + 'sentence-transformers', + 'tiktoken', + 'tokenizers', + 'tqdm', + 'umap-learn', + 'umap-learn[plot]' + ], + include_package_data=True, + # Additional metadata + author='Arik Reuter', + author_email='arik_reuter@gmx.de', + description='A package for integrating LLMs like GPT-3.5 and GPT-4 into topic modelling', + long_description=long_description, + long_description_content_type="text/markdown", + license="MIT", + keywords=['Topic Modelling', 'GPT', 'LLM', 'OpenAI', 'Retrieval Augmented Generation', 'Chat-GPT', 'GPT-3', 'GPT-4'], + classifiers=[ + "Development Status :: 3 - Alpha", + 'Intended Audience :: Science/Research', + "Intended Audience :: Developers", + "Programming Language :: Python :: 3.11", + "Operating System :: Unix", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + ] +) + diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/Client.py b/LLMTopicDetection_TopicGPT/src/topicgpt/Client.py new file mode 100644 index 0000000..c102742 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/Client.py @@ -0,0 +1,12 @@ +class Client: + def __init__(self, api_key: str, azure_endpoint: dict = None) -> None: + if azure_endpoint: + from openai import AzureOpenAI + self.client = AzureOpenAI(api_key=api_key, 
api_version=azure_endpoint['api_version'], azure_endpoint=azure_endpoint['endpoint']) + else: + from openai import OpenAI + self.client = OpenAI(api_key=api_key) + + def __getattr__(self, name): + """Delegate attribute access to the self.client object.""" + return getattr(self.client, name) diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/Clustering.py b/LLMTopicDetection_TopicGPT/src/topicgpt/Clustering.py new file mode 100644 index 0000000..90e13d3 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/Clustering.py @@ -0,0 +1,286 @@ +import numpy as np +import umap +import hdbscan +import matplotlib.pyplot as plt +import pandas as pd +import plotly.express as px +import umap.plot +from copy import deepcopy +from sklearn.cluster import AgglomerativeClustering + +from typing import Tuple + +class Clustering_and_DimRed(): + + """ + Class to perform dimensionality reduction with UMAP followed by clustering with HDBSCAN. + """ + def __init__(self, + n_dims_umap: int = 5, + n_neighbors_umap: int = 15, + min_dist_umap: float = 0, + metric_umap: str = "cosine", + min_cluster_size_hdbscan: int = 30, + metric_hdbscan: str = "euclidean", + cluster_selection_method_hdbscan: str = "eom", + number_clusters_hdbscan: int = None, + random_state: int = 42, + verbose: bool = True, + UMAP_hyperparams: dict = {}, + HDBSCAN_hyperparams: dict = {}) -> None: + """ + Initializes the clustering and dimensionality reduction parameters for topic modeling. + + Args: + n_dims_umap (int, optional): Number of dimensions to reduce to using UMAP. + n_neighbors_umap (int, optional): Number of neighbors for UMAP. + min_dist_umap (float, optional): Minimum distance for UMAP. + metric_umap (str, optional): Metric for UMAP. + min_cluster_size_hdbscan (int, optional): Minimum cluster size for HDBSCAN. + metric_hdbscan (str, optional): Metric for HDBSCAN. + cluster_selection_method_hdbscan (str, optional): Cluster selection method for HDBSCAN. 
+ number_clusters_hdbscan (int, optional): Number of clusters for HDBSCAN. If None, HDBSCAN will determine the number of clusters automatically. Ensure that min_cluster_size is not too large to find enough clusters. + random_state (int, optional): Random state for UMAP and HDBSCAN. + verbose (bool, optional): Whether to print progress. + UMAP_hyperparams (dict, optional): Additional hyperparameters for UMAP. + HDBSCAN_hyperparams (dict, optional): Additional hyperparameters for HDBSCAN. + """ + + + # do some checks on the input arguments + assert n_dims_umap > 0, "n_dims_umap must be greater than 0" + assert n_neighbors_umap > 0, "n_neighbors_umap must be greater than 0" + assert min_dist_umap >= 0, "min_dist_umap must be greater than or equal to 0" + assert min_cluster_size_hdbscan > 0, "min_cluster_size_hdbscan must be greater than 0" + assert number_clusters_hdbscan is None or number_clusters_hdbscan > 0, "number_clusters_hdbscan must be greater than 0 or None" + assert random_state is None or random_state >= 0, "random_state must be greater than or equal to 0" + + self.random_state = random_state + self.verbose = verbose + self.UMAP_hyperparams = UMAP_hyperparams + self.HDBSCAN_hyperparams = HDBSCAN_hyperparams + + # update hyperparameters for UMAP + self.UMAP_hyperparams["n_components"] = n_dims_umap + self.UMAP_hyperparams["n_neighbors"] = n_neighbors_umap + self.UMAP_hyperparams["min_dist"] = min_dist_umap + self.UMAP_hyperparams["metric"] = metric_umap + self.UMAP_hyperparams["random_state"] = random_state + self.UMAP_hyperparams["verbose"] = verbose + self.umap = umap.UMAP(**self.UMAP_hyperparams) + + self.HDBSCAN_hyperparams["min_cluster_size"] = min_cluster_size_hdbscan + self.HDBSCAN_hyperparams["metric"] = metric_hdbscan + self.HDBSCAN_hyperparams["cluster_selection_method"] = cluster_selection_method_hdbscan + self.number_clusters_hdbscan = number_clusters_hdbscan + self.hdbscan = hdbscan.HDBSCAN(**self.HDBSCAN_hyperparams) + + + def 
reduce_dimensions_umap(self, embeddings: np.ndarray) -> Tuple[np.ndarray, umap.UMAP]: + """ + Reduces dimensions of embeddings using UMAP. + + Args: + embeddings (np.ndarray): Embeddings to reduce. + + Returns: + tuple: A tuple containing two items: + - reduced_embeddings (np.ndarray): Reduced embeddings. + - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER) + """ + + mapper = umap.UMAP(**self.UMAP_hyperparams).fit(embeddings) + dim_red_embeddings = mapper.transform(embeddings) + dim_red_embeddings = dim_red_embeddings/np.linalg.norm(dim_red_embeddings, axis=1).reshape(-1,1) + return dim_red_embeddings, mapper + + def cluster_hdbscan(self, embeddings: np.ndarray) -> np.ndarray: + """ + Cluster embeddings using HDBSCAN. + + If self.number_clusters_hdbscan is not None, further clusters the data with AgglomerativeClustering to achieve a fixed number of clusters. + + Args: + embeddings (np.ndarray): Embeddings to cluster. + + Returns: + np.ndarray: Cluster labels. + """ + + labels = self.hdbscan.fit_predict(embeddings) + outliers = np.where(labels == -1)[0] + + if self.number_clusters_hdbscan is not None: + clusterer = AgglomerativeClustering(n_clusters=self.number_clusters_hdbscan) #one cluster for outliers + labels = clusterer.fit_predict(embeddings) + labels[outliers] = -1 + + # reindex to make the labels consecutive numbers from -1 to the number of clusters. -1 is reserved for outliers + unique_labels = np.unique(labels) + unique_labels_no_outliers = unique_labels[unique_labels != -1] + map2newlabel = {label: i for i, label in enumerate(unique_labels_no_outliers)} + map2newlabel[-1] = -1 + labels = np.array([map2newlabel[label] for label in labels]) + + return labels + + def cluster_and_reduce(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, umap.UMAP]: + """ + Cluster embeddings using HDBSCAN and reduce dimensions with UMAP. 
+ + Args: + embeddings (np.ndarray): Embeddings to cluster and reduce. + + Returns: + tuple: A tuple containing three items: + - reduced_embeddings (np.ndarray): Reduced embeddings. + - cluster_labels (np.ndarray): Cluster labels. + - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER) + """ + + dim_red_embeddings, umap_mapper = self.reduce_dimensions_umap(embeddings) + clusters = self.cluster_hdbscan(dim_red_embeddings) + return dim_red_embeddings, clusters, umap_mapper + + def visualize_clusters_static(self, embeddings: np.ndarray, labels: np.ndarray): + """ + Reduce dimensionality with UMAP to two dimensions and plot the clusters. + + Args: + embeddings (np.ndarray): Embeddings for which to plot clustering. + labels (np.ndarray): Cluster labels. + """ + + + # Reduce dimensionality with UMAP + reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0) + embeddings_2d = reducer.fit_transform(embeddings) + + + # Create a color palette, then map the labels to the colors. + # We add one to the number of unique labels to account for the noise points labelled as -1. + palette = plt.cm.get_cmap("tab20", len(np.unique(labels)) + 1) + + # Create a new figure + fig, ax = plt.subplots(figsize=(10, 8)) + + outlier_shown_in_legend = False + + # Iterate through all unique labels (clusters and outliers) + for label in np.unique(labels): + # Find the embeddings that are part of this cluster + cluster_points = embeddings_2d[labels == label] + + # If label is -1, these are outliers. We want to display them in grey. 
+ if label == -1: + color = 'grey' + if not outlier_shown_in_legend: + ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, label='outlier', s = 0.1) + outlier_shown_in_legend = True + else: + ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, s = 0.1) + else: + color = palette(label) + # Plot the points in this cluster without a label to prevent them from showing up in the legend + ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, s = 0.1) + + # Add a legend + ax.legend() + + # Show the plot + plt.show() + + + def visualize_clusters_dynamic(self, embeddings: np.ndarray, labels: np.ndarray, texts: list[str], class_names: list[str] = None): + """ + Visualize clusters using Plotly and enable hovering over clusters to see the beginning of the texts of the documents. + + Args: + embeddings (np.ndarray): Embeddings for which to visualize clustering. + labels (np.ndarray): Cluster labels. + texts (list[str]): Texts of the documents. + class_names (list[str], optional): Names of the classes. + """ + + + # Reduce dimensionality with UMAP + reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0) + embeddings_2d = reducer.fit_transform(embeddings) + + df = pd.DataFrame(embeddings_2d, columns=['x', 'y']) + df['text'] = [text[:200] for text in texts] + df["class"] = labels + + if class_names is not None: + df["class"] = [class_names[label] for label in labels] + + # Create a color palette, then map the labels to the colors. 
+ # Exclude the outlier (-1) label from color palette assignment + unique_labels = [label for label in np.unique(labels) if label != -1] + palette = plt.cm.get_cmap("tab20", len(unique_labels)) + + # Create color map + color_discrete_map = {label: 'rgb'+str(tuple(int(val*255) for val in palette(i)[:3])) if label != -1 else 'grey' for i, label in enumerate(unique_labels)} + color_discrete_map[-1] = 'grey' + + # plot data points where the color represents the class + fig = px.scatter(df, x='x', y='y', hover_data=['text', 'class'], color='class', color_discrete_map=color_discrete_map) + + fig.update_traces(mode='markers', marker=dict(size=3)) # Optional: Increase the marker size + + # make plot quadratic + fig.update_layout( + autosize=False, + width=1500, + height=1500, + margin=dict( + l=50, + r=50, + b=100, + t=100, + pad=4 + ) + ) + # set title + fig.update_layout(title_text='UMAP projection of the document embeddings', title_x=0.5) + + + # show plot + fig.show() + + + def umap_diagnostics(self, embeddings, hammer_edges = False): + """ + Fit UMAP on the provided embeddings and generate diagnostic plots. + + Params: + ------ + embeddings : array-like + The high-dimensional data for UMAP to reduce and visualize. + hammer_edges : bool, default False. Is computationally expensive. + + """ + new_hyperparams = deepcopy(self.UMAP_hyperparams) + new_hyperparams["n_components"] = 2 + mapper = umap.UMAP(**new_hyperparams).fit(embeddings) + + # 1. Connectivity plot with points + print("UMAP Connectivity Plot with Points") + umap.plot.connectivity(mapper, show_points=True) + plt.show() + + if hammer_edges: + # 2. Connectivity plot with edge bundling + print("UMAP Connectivity Plot with Hammer Edge Bundling") + umap.plot.connectivity(mapper, edge_bundling='hammer') + plt.show() + + # 3. PCA diagnostic plot + print("UMAP PCA Diagnostic Plot") + umap.plot.diagnostic(mapper, diagnostic_type='pca') + plt.show() + + # 4. 
Local dimension diagnostic plot + print("UMAP Local Dimension Diagnostic Plot") + umap.plot.diagnostic(mapper, diagnostic_type='local_dim') + plt.show() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/ExtractTopWords.py b/LLMTopicDetection_TopicGPT/src/topicgpt/ExtractTopWords.py new file mode 100644 index 0000000..0c01b8f --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/ExtractTopWords.py @@ -0,0 +1,429 @@ +import nltk +import string +import collections +from tqdm import tqdm +from typing import List +import numpy as np +import re +from nltk.tokenize import word_tokenize +import umap +from collections import Counter +import warnings + +from typing import List + +# make sure the import works even if the package has not been installed and just the files are used +try: + from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI +except: + from GetEmbeddingsOpenAI import GetEmbeddingsOpenAI + +nltk.download('stopwords', quiet=True) # download stopwords +nltk.download('punkt', quiet=True) # download tokenizer + +class ExtractTopWords: + + def extract_centroids(self, embeddings: np.ndarray, labels: np.ndarray) -> dict: + """ + Extract centroids of clusters. + + Args: + embeddings (np.ndarray): Embeddings to cluster and reduce. + labels (np.ndarray): Cluster labels. -1 means outlier. + + Returns: + dict: Dictionary of cluster labels and their centroids. + """ + + centroid_dict = {} + for label in np.unique(labels): + if label != -1: + centroid_dict[label] = np.mean(embeddings[labels == label], axis = 0) + + return centroid_dict + + def extract_centroid(self, embeddings: np.ndarray) -> np.ndarray: + """ + Extract the single centroid of a cluster. + + Args: + embeddings (np.ndarray): Embeddings to extract the centroid from. + + Returns: + np.ndarray: The centroid of the cluster. 
+ """ + + return np.mean(embeddings, axis = 0) + + def compute_centroid_similarity(self, embeddings: np.ndarray, centroid_dict: dict, cluster_label: int) -> np.ndarray: + """ + Compute the similarity of the document embeddings to the centroid of the cluster via cosine similarity. + + Args: + embeddings (np.ndarray): Embeddings to cluster and reduce. + centroid_dict (dict): Dictionary of cluster labels and their centroids. + cluster_label (int): Cluster label for which to compute the similarity. + + Returns: + np.ndarray: Cosine similarity of the document embeddings to the centroid of the cluster. + """ + + centroid = centroid_dict[cluster_label] + similarity = np.dot(embeddings, centroid) / (np.linalg.norm(embeddings) * np.linalg.norm(centroid)) + return similarity + + def get_most_similar_docs(self, corpus: list[str], embeddings: np.ndarray, labels: np.ndarray, centroid_dict: dict, cluster_label: int, top_n: int = 10) -> List[str]: + """ + Get the most similar documents to the centroid of a cluster. + + Args: + corpus (list[str]): List of documents. + embeddings (np.ndarray): Embeddings to cluster and reduce. + labels (np.ndarray): Cluster labels. -1 means outlier. + centroid_dict (dict): Dictionary of cluster labels and their centroids. + cluster_label (int): Cluster label for which to compute the similarity. + top_n (int, optional): Number of top documents to extract. + + Returns: + List[str]: List of the most similar documents to the centroid of a cluster. 
+ """ + + similarity = self.compute_centroid_similarity(embeddings, centroid_dict, cluster_label) + most_similar_docs = [corpus[i] for i in np.argsort(similarity)[-top_n:][::-1]] + return most_similar_docs + + def compute_corpus_vocab(self, + corpus: list[str], + remove_stopwords: bool = True, + remove_punction: bool = True, + min_word_length: int = 3, + max_word_length: int = 20, + remove_short_words: bool = True, + remove_numbers: bool = True, + verbose: bool = True, + min_doc_frequency: int = 3, + min_freq: float = 0.1, + max_freq: float = 0.9) -> list[str]: + """ + Compute the vocabulary of the corpus and perform preprocessing of the corpus. + + Args: + corpus (list[str]): List of documents. + remove_stopwords (bool, optional): Whether to remove stopwords. + remove_punction (bool, optional): Whether to remove punctuation. + min_word_length (int, optional): Minimum word length to retain. + max_word_length (int, optional): Maximum word length to retain. + remove_short_words (bool, optional): Whether to remove short words. + remove_numbers (bool, optional): Whether to remove numbers. + verbose (bool, optional): Whether to print progress and describe what is happening. + min_doc_frequency (int, optional): Minimum number of documents a word should appear in to be considered in the vocabulary. + min_freq (float, optional): Minimum frequency percentile of words to be considered in the vocabulary. + max_freq (float, optional): Maximum frequency percentile of words to be considered in the vocabulary. + + Returns: + list[str]: List of words in the corpus sorted alphabetically. 
+ """ + + stopwords = set(nltk.corpus.stopwords.words('english')) + + word_counter = collections.Counter() + doc_frequency = collections.defaultdict(set) + + for doc_id, doc in enumerate(tqdm(corpus, disable=not verbose, desc="Processing corpus")): + words = nltk.word_tokenize(doc) + for word in words: + if remove_punction and word in string.punctuation: + continue + if remove_stopwords and word.lower() in stopwords: + continue + if remove_numbers and re.search(r'\d', word): # use a regular expression to check for digits + continue + if not re.search('[a-zA-Z]', word): # checks if word contains at least one alphabetic character + continue + # remove words that do not begin with an alphabetic character + if not word[0].isalpha(): + continue + if len(word) > max_word_length or (remove_short_words and len(word) < min_word_length): + continue + + word_lower = word.lower() + word_counter[word_lower] += 1 + doc_frequency[word_lower].add(doc_id) + + total_words = sum(word_counter.values()) + freq_counter = {word: count / total_words for word, count in word_counter.items()} + + # print most common words and their frequencies + if verbose: + print("Most common words in the vocabulary:") + for word, count in word_counter.most_common(10): + print(f"{word}: {count}") + + freq_arr = np.array(list(freq_counter.values())) + + min_freq_value = np.quantile(freq_arr, min_freq, method="lower") + max_freq_value = np.quantile(freq_arr, max_freq, method="higher") + + + vocab = {} + + for word in freq_counter.keys(): + if min_freq_value <= freq_counter[word] <= max_freq_value and len(doc_frequency[word]) >= min_doc_frequency: + vocab[word] = freq_counter[word] + + vocab = {word for word in freq_counter.keys() + if min_freq_value <= freq_counter[word] <= max_freq_value + and len(doc_frequency[word]) >= min_doc_frequency} + + # Sorting the vocabulary alphabetically + vocab = sorted(list(vocab)) + + return vocab + + def compute_words_topics(self, corpus: list[str], vocab: list[str], labels: 
np.ndarray) -> dict: + """ + Compute the words per topic. + + Args: + corpus (list[str]): List of documents. + vocab (list[str]): List of words in the corpus sorted alphabetically. + labels (np.ndarray): Cluster labels. -1 means outlier. + + Returns: + dict: Dictionary of topics and their words. + """ + + + # Download NLTK resources (only required once) + nltk.download("punkt") + vocab = set(vocab) + + words_per_topic = {label: [] for label in np.unique(labels) if label != -1} + + for doc, label in tqdm(zip(corpus, labels), desc="Computing words per topic", total=len(corpus)): + if label != -1: + words = word_tokenize(doc) + for word in words: + if word.lower() in vocab: + words_per_topic[label].append(word.lower()) + + return words_per_topic + + def embed_vocab_openAI(self, client, vocab: list[str], embedder: GetEmbeddingsOpenAI = None) -> dict[str, np.ndarray]: + """ + Embed the vocabulary using the OpenAI embedding API. + + Args: + client: Client. + vocab (list[str]): List of words in the corpus sorted alphabetically. + embedder (GetEmbeddingsOpenAI, optional): Embedding object. + + Returns: + dict[str, np.ndarray]: Dictionary of words and their embeddings. + """ + + vocab = sorted(list(set(vocab))) + if embedder is None: + embedder = GetEmbeddingsOpenAI.GetEmbeddingsOpenAI(client) + result = embedder.get_embeddings(vocab) + + res_dict = {} + for word, emb in zip(vocab, result["embeddings"]): + res_dict[word] = emb + return res_dict + + def compute_bow_representation(self, document: str, vocab: list[str], vocab_set: set[str]) -> np.ndarray: + """ + Compute the bag-of-words representation of a document. + + Args: + document (str): Document to compute the bag-of-words representation of. + vocab (list[str]): List of words in the corpus sorted alphabetically. + vocab_set (set[str]): Set of words in the corpus sorted alphabetically. + + Returns: + np.ndarray: Bag-of-words representation of the document. 
+ """ + + bow = np.zeros(len(vocab)) + words = word_tokenize(document) + if vocab_set is None: + vocab_set = set(vocab) + for word in words: + if word.lower() in vocab_set: + bow[vocab.index(word.lower())] += 1 + return bow + + def compute_word_topic_mat_old(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers: bool = False) -> np.ndarray: + """ + Compute the word-topic matrix. + + Args: + corpus (list[str]): List of documents. + vocab (list[str]): List of words in the corpus sorted alphabetically. + labels (np.ndarray): Cluster labels. -1 means outlier. + consider_outliers (bool, optional): Whether to consider outliers when computing the top words. I.e. whether the labels contain -1 to indicate outliers. + + Returns: + np.ndarray: Word-topic matrix. + """ + + if consider_outliers: + word_topic_mat = np.zeros(len(vocab), len((np.unique(labels)))) + else: + word_topic_mat = np.zeros((len(vocab), len((np.unique(labels)) - 1))) + + vocab_set = set(vocab) + for i, doc in tqdm(enumerate(corpus), desc="Computing word-topic matrix", total=len(corpus)): + if labels[i] > - 0.5: + bow = self.compute_bow_representation(doc, vocab, vocab_set) + idx_to_add = labels[i] + word_topic_mat[:, idx_to_add] += bow + + return word_topic_mat + + def compute_word_topic_mat(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers=False) -> np.ndarray: + """ + Compute the word-topic matrix efficiently. + + Args: + corpus (list[str]): List of documents. + vocab (list[str]): List of words in the corpus, sorted alphabetically. + labels (np.ndarray): Cluster labels. -1 indicates outliers. + consider_outliers (bool, optional): Whether to consider outliers when computing the top words. Defaults to False. + + Returns: + np.ndarray: Word-topic matrix. 
+ """ + + + corpus_arr = np.array(corpus) + + if consider_outliers: + word_topic_mat = np.zeros((len(vocab), len((np.unique(labels))))) + else: + word_topic_mat = np.zeros((len(vocab), len((np.unique(labels))))) + + for i, label in tqdm(enumerate(np.unique(labels)), desc="Computing word-topic matrix", total=len(np.unique(labels))): + topic_docs = corpus_arr[labels == label] + topic_doc_string = " ".join(topic_docs) + topic_doc_words = word_tokenize(topic_doc_string) + topic_doc_counter = Counter(topic_doc_words) + + word_topic_mat[:, i] = np.array([topic_doc_counter.get(word, 0) for word in vocab]) + + return word_topic_mat + + def extract_topwords_tfidf(self, word_topic_mat: np.ndarray, vocab: list[str], labels: np.ndarray, top_n_words: int = 10) -> dict: + """ + Extract the top words for each topic using a class-based tf-idf score. + + Args: + word_topic_mat (np.ndarray): Word-topic matrix. + vocab (list[str]): List of words in the corpus sorted alphabetically. + labels (np.ndarray): Cluster labels. -1 means outlier. + top_n_words (int, optional): Number of top words to extract per topic. + + Returns: + dict: Dictionary of topics and their top words. 
+ """ + + + if min(labels) == -1: + word_topic_mat = word_topic_mat[:, 1:] + + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + tf = word_topic_mat / np.sum(word_topic_mat, axis=0) + idf = np.log(1 + (word_topic_mat.shape[1] / np.sum(word_topic_mat > 0, axis=1))) + + tfidf = tf * idf[:, np.newaxis] + + # set tfidf to zero if tf is nan (happens if word does not occur in any document or topic does not have any words) + tfidf[np.isnan(tf)] = 0 + + # extract top words for each topic + top_words = {} + top_word_scores = {} + for topic in np.unique(labels): + if topic != -1: + indices = np.argsort(-tfidf[:, topic])[:top_n_words] + top_words[topic] = [vocab[word_idx] for word_idx in indices] + top_word_scores[topic] = [tfidf[word_idx, topic] for word_idx in indices] + + + return top_words, top_word_scores + + def compute_embedding_similarity_centroids(self, vocab: list[str], vocab_embedding_dict: dict, umap_mapper: umap.UMAP, centroid_dict: dict, reduce_vocab_embeddings: bool = False, reduce_centroid_embeddings: bool = False) -> np.ndarray: + """ + Compute the cosine similarity of each word in the vocabulary to each centroid. + + Args: + vocab (list[str]): List of words in the corpus sorted alphabetically. + vocab_embedding_dict (dict): Dictionary of words and their embeddings. + umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings. + centroid_dict (dict): Dictionary of cluster labels and their centroids. -1 means outlier. + reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper. + reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper. + + Returns: + np.ndarray: Cosine similarity of each word in the vocab to each centroid. Has shape (len(vocab), len(centroid_dict) - 1). 
+ """ + + embedding_dim = umap_mapper.n_components + centroid_arr = np.zeros((len(centroid_dict), embedding_dim)) + for i, centroid in enumerate(centroid_dict.values()): + centroid_arr[i] = centroid + if reduce_centroid_embeddings: + centroid_arr = umap_mapper.transform(centroid_arr) + + centroid_arr = centroid_arr / np.linalg.norm(centroid_arr, axis=1).reshape(-1,1) + + + org_embedding_dim = list(vocab_embedding_dict.values())[0].shape[0] + vocab_arr = np.zeros((len(vocab), org_embedding_dim)) + for i, word in enumerate(vocab): + vocab_arr[i] = vocab_embedding_dict[word] + if reduce_vocab_embeddings: + vocab_arr = umap_mapper.transform(vocab_arr) + + vocab_arr = vocab_arr / np.linalg.norm(vocab_arr, axis=1).reshape(-1,1) + + similarity = vocab_arr @ centroid_arr.T # cosine similarity + return similarity + + def extract_topwords_centroid_similarity(self, word_topic_mat: np.ndarray, vocab: list[str], vocab_embedding_dict: dict, centroid_dict: dict, umap_mapper: umap.UMAP, top_n_words: int = 10, reduce_vocab_embeddings: bool = True, reduce_centroid_embeddings: bool = False, consider_outliers: bool = False) -> tuple[dict, np.ndarray]: + """ + Extract the top words for each cluster by computing the cosine similarity of the words that occur in the corpus to the centroid of the cluster. + + Args: + word_topic_mat (np.ndarray): Word-topic matrix. + vocab (list[str]): List of words in the corpus sorted alphabetically. + vocab_embedding_dict (dict): Dictionary of words and their embeddings. + centroid_dict (dict): Dictionary of cluster labels and their centroids. -1 means outlier. + umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings. + top_n_words (int, optional): Number of top words to extract per topic. + reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper. + reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper. 
+ consider_outliers (bool, optional): Whether to consider outliers when computing the top words. I.e., whether the labels contain -1 to indicate outliers. + + Returns: + dict: Dictionary of topics and their top words. + np.ndarray: Cosine similarity of each word in the vocab to each centroid. Has shape (len(vocab), len(centroid_dict) - 1). + """ + + similarity_mat = self.compute_embedding_similarity_centroids(vocab, vocab_embedding_dict, umap_mapper, centroid_dict, reduce_vocab_embeddings, reduce_centroid_embeddings) + top_words = {} + top_word_scores = {} + + if word_topic_mat.shape[1] > len(np.unique(list(centroid_dict.keys()))): + word_topic_mat = word_topic_mat[:, 1:] #ignore outliers + + for i, topic in enumerate(np.unique(list(centroid_dict.keys()))): + if topic != -1: + topic_similarity_mat = similarity_mat[:, topic] * word_topic_mat[:, topic] + top_words[topic] = [vocab[word_idx] for word_idx in np.argsort(-topic_similarity_mat)[:top_n_words]] + top_word_scores[topic] = [similarity_mat[word_idx, topic] for word_idx in np.argsort(-similarity_mat[:, topic])[:top_n_words]] + + return top_words, top_word_scores \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/GetEmbeddingsOpenAI.py b/LLMTopicDetection_TopicGPT/src/topicgpt/GetEmbeddingsOpenAI.py new file mode 100644 index 0000000..e4465a6 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/GetEmbeddingsOpenAI.py @@ -0,0 +1,217 @@ +from openai import OpenAI + +import tiktoken +from tqdm import tqdm +import numpy as np + +class GetEmbeddingsOpenAI: + """ + This class allows to compute embeddings of text using the OpenAI API. + """ + + def __init__(self, client, azure_config: dict = {}, embedding_model: str = "text-embedding-ada-002", tokenizer: str = None, max_tokens: int = 8191) -> None: + """ + Constructor of the class. + + Args: + client: Client. + embedding_model (str, optional): Name of the embedding model to use. 
+ tokenizer (str, optional): Name of the tokenizer to use. + max_tokens (int, optional): Maximum number of tokens to use. + + Note: + By default, the embedding model "text-embedding-ada-002" is used with the corresponding tokenizer "cl100k_base" and a maximum number of tokens of 8191. + """ + + self.client = client + self.embedding_model = embedding_model + self.tokenizer_str = tokenizer + self.max_tokens = max_tokens + + @staticmethod + def num_tokens_from_string(string: str, encoding) -> int: + """ + Returns the number of tokens in a text string. + + Args: + string (str): Text string to compute the number of tokens. + encoding: A function to encode the string into tokens. + + Returns: + int: Number of tokens in the text string. + """ + num_tokens = len(encoding.encode(string)) + return num_tokens + + def compute_number_of_tokens(self, corpus: list[str]) -> int: + """ + Computes the total number of tokens needed to embed the corpus. + + Args: + corpus (list[str]): List of strings to embed, where each element in the list is a document. + + Returns: + int: Total number of tokens needed to embed the corpus. + """ + + + if self.tokenizer_str is None: + tokenizer = tiktoken.encoding_for_model(self.embedding_model) + + else: + tokenizer = tiktoken.get_encoding(self.tokenizer_str) + + num_tokens = 0 + for document in tqdm(corpus): + num_tokens += self.num_tokens_from_string(document, tokenizer) + + return num_tokens + + def split_doc(self, text): + """ + Splits a single document that is longer than the maximum number of tokens into a list of smaller documents. + + Args: + self: The instance of the class. + text (str): The string to be split. + + Returns: + List[str]: A list of strings to embed, where each element in the list is a list of chunks comprising the document. 
+ """ + + split_text = [] + split_text.append(text[:self.max_tokens]) + for i in range(1, len(text) // self.max_tokens): + split_text.append(text[i * self.max_tokens:(i + 1) * self.max_tokens]) + split_text.append(text[(len(text) // self.max_tokens) * self.max_tokens:]) + return split_text + + def split_long_docs(self, text: list[str]) -> list[list[str]]: + """ + Splits all documents that are longer than the maximum number of tokens into a list of smaller documents. + + Args: + self: The instance of the class. + text (list[str]): List of strings to embed, where each element in the list is a document. + + Returns: + List[list[str]]: A list of lists of strings to embed, where each element in the outer list is a list of chunks comprising the document. + """ + + if self.tokenizer_str is None: + tokenizer = tiktoken.encoding_for_model(self.embedding_model) + else: + tokenizer = tiktoken.get_encoding(self.tokenizer_str) + + + split_text = [] + for document in tqdm(text): + if self.num_tokens_from_string(document, tokenizer) > self.max_tokens: + split_text.append(self.split_doc(document)) + else: + split_text.append([document]) + return split_text + + def make_api_call(self, text: str): + """ + Makes an API call to the OpenAI API to embed a text string. + + Args: + self: The instance of the class. + text (str): The string to embed. + + Returns: + API response: The response from the API. + """ + response = self.client.embeddings.create(input = [text], model = self.embedding_model) + return response + + + + def get_embeddings_doc_split(self, corpus: list[list[str]], n_tries=3) -> list[dict]: + """ + Computes the embeddings of a corpus for split documents. + + Args: + self: The instance of the class. + corpus (list[list[str]]): List of strings to embed, where each element is a document represented by a list of its chunks. + n_tries (int, optional): Number of tries to make an API call (default is 3). 
+ + Returns: + List[dict]: A list of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process. + """ + + api_res_list = [] + for i in tqdm(range(len(corpus))): + chunk_lis = corpus[i] + api_res_doc = [] + for chunk_n, chunk in enumerate(chunk_lis): + + for i in range(n_tries + 1): + try: + api_res_doc.append( + {"api_res": self.make_api_call(chunk), + "error": None } + ) + break + except Exception as e: + print(f"Error {e} occured for chunk {chunk_n} of document {i}") + print(chunk) + print("Trying again.") + if i == n_tries: + print("Maximum number of tries reached. Skipping chunk.") + api_res_doc.append( + {"api_res": None, + "error": e }) + + + # average the embeddings of the chunks + emb_lis = [] + for api_res in api_res_doc: + if api_res["api_res"] is not None: + emb_lis.append(np.array(api_res["api_res"].data[0].embedding)) + text = " ".join(chunk_lis) + embedding = np.mean(emb_lis, axis = 0) + api_res_list.append( + {"embedding": embedding, + "text": text, + "errors": [api_res["error"] for api_res in api_res_doc]} + ) + return api_res_list + + def convert_api_res_list(self, api_res_list: list[dict]) -> dict: + """ + Converts the api_res list into a dictionary containing the embeddings as a matrix and the corpus as a list of strings. + + Args: + self: The instance of the class. + api_res_list (list[dict]): List of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process. + + Returns: + dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings. 
+ """ + + + embeddings = np.array([api_res["embedding"] for api_res in api_res_list]) + corpus = [api_res["text"] for api_res in api_res_list] + errors = [api_res["errors"] for api_res in api_res_list] + return {"embeddings": embeddings, "corpus": corpus, "errors": errors} + + + def get_embeddings(self, corpus: list[str]) -> dict: + """ + Computes the embeddings of a corpus. + + Args: + self: The instance of the class. + corpus (list[str]): List of strings to embed, where each element in the list is a document. + + Returns: + dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings. + """ + + corpus_split = self.split_long_docs(corpus) + corpus_emb = self.get_embeddings_doc_split(corpus_split) + self.corpus_emb = corpus_emb + res = self.convert_api_res_list(corpus_emb) + return res \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestPrompting.py b/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestPrompting.py new file mode 100644 index 0000000..efed7f9 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestPrompting.py @@ -0,0 +1,137 @@ +from topicgpt.TopicRepresentation import Topic + +import unittest +from sklearn.datasets import fetch_20newsgroups + +from topicgpt.TopicGPT import TopicGPT + + +import sys + + +class QuickestTopicGPT_prompting(unittest.TestCase): + """ + This class is used to mainly test the prompting functionality of the TopicGPT class. + """ + + + @classmethod + def setUpClass(cls, sample_size:int = 500): + """ + download the necessary data and only keep a sample of it + params: + client: Client. 
+ sample_size: the number of documents to use for the test + """ + + data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset + corpus = data['data']# just select the first 1000 documents for this example + corpus = [doc for doc in corpus if doc != ""] + corpus = corpus[:sample_size] + + cls.corpus = corpus + + cls.tm = TopicGPT(client = client, n_topics = 1) + cls.tm.fit(cls.corpus) + + def test_repr_topics(self): + """ + test the repr_topics function of the TopicGPT class + """ + print("Testing repr_topics...") + self.assertTrue(type(self.tm.repr_topics()) == str) + + def test_promt_knn_search(self): + """ + test the ppromt function that calls knn_search of the TopicPrompting class + """ + print("Testing ppromt_knn_search...") + + prompt_lis = ["Is topic 0 about Bananas? Use knn Search", + "Is topic 0 about Space? Use knn Search"] + + for prompt in prompt_lis: + + answer, function_result = self.tm.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result[0]) == list) + self.assertTrue(type(function_result[1]) == list) + self.assertTrue(type(function_result[0][0]) == str) + self.assertTrue(type(function_result[1][0]) == int) + + + def test_prompt_split_topic_kmeans_inplace(self): + """ + test the ppromt function that calls split_topic_kmeans of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_kmeans...") + + prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. 
Do this inplace"] + added_topic_lis_len = [2] + + old_number_of_topics = len(self.tm.topic_lis) + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.tm.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.tm.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.tm.topic_lis == function_result) + + + def test_prompt_combine_topics_inplace(self): + """ + test the prompt function that calls combine_topics of the TopicPrompting class + """ + + print("Testing ppromt_combine_topics...") + + prompt_lis = ["Combine topic 0 and topic 1 into one topic. Do this inplace"] + + # split topic first + self.tm.prompt("Please split topic 0 into two subtopic. Do this inplace.") + + old_number_topics = len(self.tm.topic_lis) + + + + for prompt in prompt_lis: + + answer, function_result = self.tm.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + print("topic_gpt_topic_list: ", self.tm.topic_lis) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.tm.topic_lis == function_result) + self.assertTrue(len(self.tm.topic_lis) == old_number_topics -1) + + +if __name__ == "__main__": + + for i, arg in enumerate(sys.argv): + if arg == "--api-key": + api_key = sys.argv.pop(i + 1) + sys.argv.pop(i) + break + + if api_key is None: + print("API key must be provided with --api-key") + sys.exit(1) + + + unittest.main() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestSetup.py b/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestSetup.py new 
file mode 100644 index 0000000..be654dc --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/QuickTests/TestSetup.py @@ -0,0 +1,120 @@ +from topicgpt.TopicRepresentation import Topic + +import unittest +from sklearn.datasets import fetch_20newsgroups + +from topicgpt.TopicGPT import TopicGPT + + +class QuickTestTopicGPT_init_and_fit(unittest.TestCase): + """ + Run some basic tests on TopicGPT that do not require any saved data + """ + + + @classmethod + def setUpClass(cls, sample_size:int = 500): + """ + download the necessary data and only keep a sample of it + params: + api_key: the openai api key + sample_size: the number of documents to use for the test + """ + + data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset + corpus = data['data']# just select the first 1000 documents for this example + corpus = [doc for doc in corpus if doc != ""] + corpus = corpus[:sample_size] + + cls.corpus = corpus + + def setUp(self): + self.api_key_openai = api_key + + + def test_init(self): + """ + test the init function of the TopicGPT class + """ + print("Testing init...") + topicgpt = TopicGPT(api_key = self.api_key_openai) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + corpus_instruction="This is a corpus instruction") + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + # check if assertions are triggered + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = None, + n_topics= 32, + openai_prompting_model="gpt-4", + max_number_of_tokens=8000, + corpus_instruction="This is a corpus instruction") + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 0, + max_number_of_tokens=8000, + corpus_instruction="This is a corpus instruction") + + 
with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + max_number_of_tokens=0, + corpus_instruction="This is a corpus instruction") + + + def test_fit(self): + """ + test the fit function of the TopicGPT class + """ + print("Testing fit...") + + def instance_test(topicgpt): + topicgpt.fit(self.corpus) + + self.assertTrue(hasattr(topicgpt, "vocab")) + self.assertTrue(hasattr(topicgpt, "topic_lis")) + + self.assertTrue(isinstance(topicgpt.vocab, list)) + self.assertTrue(isinstance(topicgpt.vocab[0], str)) + + self.assertTrue(isinstance(topicgpt.topic_lis, list)) + self.assertTrue(type(topicgpt.topic_lis[0]) == Topic) + + if topicgpt.n_topics is not None: + self.assertTrue(len(topicgpt.topic_lis) == topicgpt.n_topics) + + self.assertTrue(topicgpt.topic_lis == topicgpt.topic_prompting.topic_lis) + self.assertTrue(topicgpt.vocab == topicgpt.topic_prompting.vocab) + self.assertTrue(topicgpt.vocab_embeddings == topicgpt.topic_prompting.vocab_embeddings) + + + topicgpt1 = TopicGPT(api_key = self.api_key_openai, n_topics = 1) + + topic_gpt_list = [topicgpt1] + + for topic_gpt in topic_gpt_list: + instance_test(topic_gpt) + + +import sys + +if __name__ == "__main__": + for i, arg in enumerate(sys.argv): + if arg == "--api-key": + api_key = sys.argv.pop(i + 1) + sys.argv.pop(i) + break + + if api_key is None: + print("API key must be provided with --api-key") + sys.exit(1) + unittest.main() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/TopicGPT.py b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicGPT.py new file mode 100644 index 0000000..331017a --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicGPT.py @@ -0,0 +1,378 @@ +import numpy as np +import os +import pickle +# make sure the import works even if the package has not been installed and just the files are used +from topicgpt.Clustering import Clustering_and_DimRed +from topicgpt.ExtractTopWords import ExtractTopWords 
+from topicgpt.TopwordEnhancement import TopwordEnhancement +from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI +from topicgpt.TopicPrompting import TopicPrompting +from topicgpt.TopicRepresentation import Topic +from topicgpt.Client import Client +import topicgpt.TopicRepresentation as TopicRepresentation + + +embeddings_path= "SavedEmbeddings/embeddings.pkl" #global variable for the path to the embeddings + +class TopicGPT: + """ + This is the main class for doing topic modelling with TopicGPT. + """ + + def __init__(self, + api_key: str = "", + azure_endpoint: dict = {}, + n_topics: int = None, + openai_prompting_model: str = "gpt-3.5-turbo-16k", + max_number_of_tokens: int = 16384, + corpus_instruction: str = "", + document_embeddings: np.ndarray = None, + vocab_embeddings: dict[str, np.ndarray] = None, + embedding_model: str = "text-embedding-ada-002", + max_number_of_tokens_embedding: int = 8191, + use_saved_embeddings: bool = True, + path_saved_embeddings: str = embeddings_path, + clusterer: Clustering_and_DimRed = None, + n_topwords: int = 2000, + n_topwords_description: int = 500, + topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], + compute_vocab_hyperparams: dict = {}, + enhancer: TopwordEnhancement = None, + topic_prompting: TopicPrompting = None, + verbose: bool = True) -> None: + + """ + Initializes the main class for conducting topic modeling with TopicGPT. + + Args: + api_key (str): Your OpenAI API key. Obtain this key from https://beta.openai.com/account/api-keys. + n_topics (int, optional): Number of topics to discover. If None, the Hdbscan algorithm (https://pypi.org/project/hdbscan/) is used to determine the number of topics automatically. Otherwise, agglomerative clustering is used. Note that with insufficient data, fewer topics may be found than specified. + openai_prompting_model (str, optional): Model provided by OpenAI for topic description and prompts. 
Refer to https://platform.openai.com/docs/models for available models. + max_number_of_tokens (int, optional): Maximum number of tokens to use for the OpenAI API. + corpus_instruction (str, optional): Additional information about the corpus, if available, to benefit the model. + document_embeddings (np.ndarray, optional): Document embeddings for the corpus. If None, they will be computed using the OpenAI API. + vocab_embeddings (dict[str, np.ndarray], optional): Vocabulary embeddings for the corpus in a dictionary format where keys are words and values are embeddings. If None, they will be computed using the OpenAI API. + embedding_model (str, optional): Name of the embedding model to use. See https://beta.openai.com/docs/api-reference/text-embedding for available models. + max_number_of_tokens_embedding (int, optional): Maximum number of tokens to use for the OpenAI API when computing embeddings. + use_saved_embeddings (bool, optional): Whether to use saved embeddings. If True, embeddings are loaded from the file 'SavedEmbeddings/embeddings.pkl' or path_saved_embeddings if different. If False, embeddings are computed using the OpenAI API and saved to the file. + path_saved_embeddings (str, optional): Path to the saved embeddings file. + clusterer (Clustering_and_DimRed, optional): Clustering and dimensionality reduction object. Find the class in the "Clustering/Clustering" folder. If None, a clustering object with default parameters is used. Note that providing document and vocab embeddings and an embedding object at the same time is not sensible; the number of topics specified in the clusterer will overwrite the n_topics argument. + n_topwords (int, optional): Number of top words to extract and save for each topic. Note that fewer top words might be used later. + n_topwords_description (int, optional): Number of top words to provide to the LLM (Language Model) to describe the topic. 
+ topword_extraction_methods (list[str], optional): List of methods for extracting top words. Available methods include "tfidf", "cosine_similarity", and "topword_enhancement". Refer to the file 'ExtractTopWords/ExtractTopWords.py' for more details. + compute_vocab_hyperparams (dict, optional): Hyperparameters for computing vocabulary embeddings. Refer to the file 'ExtractTopWords/ExtractTopWords.py' for more details. + enhancer (TopwordEnhancement, optional): Topword enhancement object. Used for describing topics. Find the class in the "TopwordEnhancement/TopwordEnhancement.py" folder. If None, a topword enhancement object with default parameters is used. If an openai model is specified here, it will overwrite the openai_prompting_model argument for topic description. + topic_prompting (TopicPrompting, optional): Topic prompting object for formulating prompts. Find the class in the "TopicPrompting/TopicPrompting.py" folder. If None, a topic prompting object with default parameters is used. If an openai model is specified here, it will overwrite the openai_prompting_model argument for topic description. + verbose (bool, optional): Whether to print detailed information about the process. This can be overridden by arguments in passed objects. + """ + + + + # Do some checks on the input arguments + assert api_key is not None, "You need to provide an OpenAI API key." + assert n_topics is None or n_topics > 0, "The number of topics needs to be a positive integer." + assert max_number_of_tokens > 0, "The maximum number of tokens needs to be a positive integer." + assert max_number_of_tokens_embedding > 0, "The maximum number of tokens for the embedding model needs to be a positive integer." + assert n_topwords > 0, "The number of top words needs to be a positive integer." + assert n_topwords_description > 0, "The number of top words for the topic description needs to be a positive integer." 
+ assert len(topword_extraction_methods) > 0, "You need to provide at least one topword extraction method." + assert n_topwords_description <= n_topwords, "The number of top words for the topic description needs to be smaller or equal to the number of top words." + + self.client = Client(api_key = api_key, azure_endpoint = azure_endpoint) + + + self.n_topics = n_topics + self.openai_prompting_model = openai_prompting_model + self.max_number_of_tokens = max_number_of_tokens + self.corpus_instruction = corpus_instruction + self.document_embeddings = document_embeddings + self.vocab_embeddings = vocab_embeddings + self.embedding_model = embedding_model + self.max_number_of_tokens_embedding = max_number_of_tokens_embedding + self.embedder = GetEmbeddingsOpenAI(client = self.client, embedding_model = self.embedding_model, max_tokens = self.max_number_of_tokens_embedding) + self.clusterer = clusterer + self.n_topwords = n_topwords + self.n_topwords_description = n_topwords_description + self.topword_extraction_methods = topword_extraction_methods + self.compute_vocab_hyperparams = compute_vocab_hyperparams + self.enhancer = enhancer + self.topic_prompting = topic_prompting + self.use_saved_embeddings = use_saved_embeddings + self.verbose = verbose + + self.compute_vocab_hyperparams["verbose"] = self.verbose + + # if embeddings have already been downloaded to the folder SavedEmbeddings, then load them + if self.use_saved_embeddings and os.path.exists(path_saved_embeddings): + with open(path_saved_embeddings, "rb") as f: + self.document_embeddings, self.vocab_embeddings = pickle.load(f) + + for elem in topword_extraction_methods: + assert elem in ["tfidf", "cosine_similarity", "topword_enhancement"], "Invalid topword extraction method. Valid methods are 'tfidf', 'cosine_similarity', and 'topword_enhancement'." 
+ + if clusterer is None: + self.clusterer = Clustering_and_DimRed(number_clusters_hdbscan = self.n_topics, verbose = self.verbose) + else: + self.n_topics = clusterer.number_clusters_hdbscan + + if enhancer is None: + self.enhancer = TopwordEnhancement(client = self.client, openai_model = self.openai_prompting_model, max_context_length = self.max_number_of_tokens, corpus_instruction = self.corpus_instruction) + + if topic_prompting is None: + self.topic_prompting = TopicPrompting(topic_lis = [], client = self.client, openai_prompting_model = self.openai_prompting_model, max_context_length_promting = 16000, enhancer = self.enhancer, openai_embedding_model = self.embedding_model, max_context_length_embedding = self.max_number_of_tokens_embedding, corpus_instruction = corpus_instruction) + + self.extractor = ExtractTopWords() + + def __repr__(self) -> str: + repr = "TopicGPT object with the following parameters:\n" + repr += "-"*150 + "\n" + repr += "n_topics: " + str(self.n_topics) + "\n" + repr += "openai_prompting_model: " + self.openai_prompting_model + "\n" + repr += "max_number_of_tokens: " + str(self.max_number_of_tokens) + "\n" + repr += "corpus_instruction: " + self.corpus_instruction + "\n" + repr += "embedding_model: " + self.embedding_model + "\n" + repr += "clusterer: " + str(self.clusterer) + "\n" + repr += "n_topwords: " + str(self.n_topwords) + "\n" + repr += "n_topwords_description: " + str(self.n_topwords_description) + "\n" + repr += "topword_extraction_methods: " + str(self.topword_extraction_methods) + "\n" + repr += "compute_vocab_hyperparams: " + str(self.compute_vocab_hyperparams) + "\n" + repr += "enhancer: " + str(self.enhancer) + "\n" + repr += "topic_prompting: " + str(self.topic_prompting) + "\n" + + return repr + + def compute_embeddings(self, corpus: list[str]) -> tuple[np.ndarray, dict[str, np.ndarray]]: + """ + Computes document and vocabulary embeddings for the given corpus. 
+ + Args: + corpus (list[str]): List of strings to embed, where each element is a document. + + Returns: + tuple: A tuple containing two items: + - document_embeddings (np.ndarray): Document embeddings for the corpus, with shape (len(corpus), n_embedding_dimensions). + - vocab_embeddings (dict[str, np.ndarray]): Vocabulary embeddings for the corpus, provided as a dictionary where keys are words and values are embeddings. + """ + + + self.document_embeddings = self.embedder.get_embeddings(corpus)["embeddings"] + + self.vocab_embeddings = self.extractor.embed_vocab_openAI(self.client, self.vocab, embedder = self.embedder) + + return self.document_embeddings, self.vocab_embeddings + + def extract_topics(self, corpus: list[str]) -> list[Topic]: + """ + Extracts topics from the given corpus. + + Args: + corpus (list[str]): List of strings to process, where each element represents a document. + + Returns: + list[Topic]: A list of Topic objects representing the extracted topics. + """ + + assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first." + + if self.vocab is None: + self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams) + + self.topic_lis = TopicRepresentation.extract_topics_no_new_vocab_computation( + corpus = corpus, + vocab = self.vocab, + document_embeddings = self.document_embeddings, + clusterer = self.clusterer, + vocab_embeddings = self.vocab_embeddings, + n_topwords = self.n_topwords, + topword_extraction_methods = self.topword_extraction_methods, + consider_outliers = True + ) + + return self.topic_lis + + def describe_topics(self, topics: list[Topic]) -> list[Topic]: + """ + Names and describes the provided topics using the OpenAI API. + + Args: + topics (list[Topic]): List of Topic objects to be named and described. + + Returns: + list[Topic]: A list of Topic objects with names and descriptions. 
+ """ + + + assert self.topic_lis is not None, "You need to extract the topics first." + + if "cosine_similarity" in self.topword_extraction_methods: + topword_method = "cosine_similarity" + elif "tfidf" in self.topword_extraction_methods: + topword_method = "tfidf" + else: + raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.") + + self.topic_lis = TopicRepresentation.describe_and_name_topics( + topics = topics, + enhancer = self.enhancer, + topword_method= topword_method, + n_words = self.n_topwords_description + ) + + return self.topic_lis + + def fit(self, corpus: list[str], verbose: bool = True): + """ + Compute embeddings if necessary, extract topics, and describe them. + + Args: + corpus (list[str]): List of strings to embed, where each element represents a document. + verbose (bool, optional): Whether to print the progress and details of the process. + """ + + self.corpus = corpus + + # remove empty documents + len_before_removing = len(self.corpus) + while '' in self.corpus: + corpus.remove('') + len_after_removing = len(self.corpus) + if verbose: + print("Removed " + str(len_before_removing - len_after_removing) + " empty documents.") + + if self.vocab_embeddings is None: + if verbose: + print("Computing vocabulary...") + + self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams) + else: + print('Vocab already computed') + self.vocab = list(self.vocab_embeddings.keys()) + + if self.vocab_embeddings is None or self.document_embeddings is None: + if verbose: + print("Computing embeddings...") + self.compute_embeddings(corpus = self.corpus) + else: + print('Embeddings already computed') + if verbose: + print("Extracting topics...") + self.topic_lis = self.extract_topics(corpus = self.corpus) + + if verbose: + print("Describing topics...") + self.topic_lis = self.describe_topics(topics = self.topic_lis) + + self.topic_prompting.topic_lis = self.topic_lis + 
self.topic_prompting.vocab_embeddings = self.vocab_embeddings + self.topic_prompting.vocab = self.vocab + + def visualize_clusters(self): + """ + Visualizes the identified clusters representing the topics in a scatterplot. + """ + + assert self.topic_lis is not None, "You need to extract the topics first." + + all_document_embeddings = np.concatenate([topic.document_embeddings_hd for topic in self.topic_lis], axis = 0) + all_texts = np.concatenate([topic.documents for topic in self.topic_lis], axis = 0) + all_document_indices = np.concatenate([np.repeat(i, topic.document_embeddings_hd.shape[0]) for i, topic in enumerate(self.topic_lis)], axis = 0) + class_names = [str(topic) for topic in self.topic_lis] + + self.clusterer.visualize_clusters_dynamic(all_document_embeddings, all_document_indices, all_texts, class_names) + + def repr_topics(self) -> str: + """ + Returns a string explanation of the topics. + """ + + assert self.topic_lis is not None, "You need to extract the topics first." + + if "cosine_similarity" in self.topword_extraction_methods: + topword_method = "cosine_similarity" + elif "tfidf" in self.topword_extraction_methods: + topword_method = "tfidf" + else: + raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.") + + repr = "" + for topic in self.topic_lis: + repr += str(topic) + "\n" + repr += "Topic_description: " + topic.topic_description + "\n" + repr += "Top words: " + str(topic.top_words[topword_method][:10]) + "\n" + repr += "\n" + repr += "-"*150 + "\n" + + return repr + + def print_topics(self): + """ + Prints a string explanation of the topics. + """ + + print(self.repr_topics()) + + def prompt(self, query: str) -> tuple[str, object]: + """ + Prompts the model with the given query. + + Args: + query (str): The query to prompt the model with. + + Returns: + tuple: A tuple containing two items: + - answer (str): The answer from the model. 
+ - function_result (object): The result of the function call. + + Note: + Please refer to the TopicPrompting class for more details on available functions for prompting the model. + """ + + + result = self.topic_prompting.general_prompt(query) + + answer = result[0][-1].choices[0].message.content + function_result = result[1] + self.topic_prompting._fix_dictionary_topwords() + self.topic_lis = self.topic_prompting.topic_lis + + return answer, function_result + + def pprompt(self, query: str, return_function_result: bool = True) -> object: + """ + Prompts the model with the given query and prints the answer. + + Args: + query (str): The query to prompt the model with. + return_function_result (bool, optional): Whether to return the result of the function call by the Language Model (LLM). + + Returns: + object: The result of the function call if return_function_result is True, otherwise None. + """ + + + answer, function_result = self.prompt(query) + + print(answer) + + if return_function_result: + return function_result + + def save_embeddings(self, path: str = embeddings_path) -> None: + """ + Saves the document and vocabulary embeddings to a pickle file for later re-use. + + Args: + path (str, optional): The path to save the embeddings to. Defaults to embeddings_path. + """ + + + assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first." 
+ + # create dictionary if it doesn't exist yet + if not os.path.exists("SavedEmbeddings"): + os.makedirs("SavedEmbeddings") + + + with open(path, "wb") as f: + pickle.dump([self.document_embeddings, self.vocab_embeddings], f) + diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/TopicPrompting.py b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicPrompting.py new file mode 100644 index 0000000..01550b8 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicPrompting.py @@ -0,0 +1,1271 @@ +import openai +from openai import OpenAI +import numpy as np +import json +import tiktoken +import openai +from openai import OpenAI +import re +import sklearn +import hdbscan +from copy import deepcopy + +# make sure the import works even if the package has not been installed and just the files are used +try: + from topicgpt.TopicRepresentation import Topic + from topicgpt.TopicRepresentation import extract_and_describe_topic_cos_sim + from topicgpt.TopicRepresentation import extract_describe_topics_labels_vocab + from topicgpt.TopwordEnhancement import TopwordEnhancement +except: + from TopicRepresentation import Topic + from TopicRepresentation import extract_and_describe_topic_cos_sim + from TopicRepresentation import extract_describe_topics_labels_vocab + from TopwordEnhancement import TopwordEnhancement + + +basic_model_instruction = """You are a helpful assistant. +You are excellent at inferring information about topics discovered via topic modelling using information retrieval. +You summarize information intelligently. +You use the functions you are provided with if applicable. +You make sure that everything you output is strictly based on the provided text. If you cite documents, give their indices. +You always explicitly say if you don't find any useful information! 
+You only say that something is contained in the corpus if you are very sure about it!""" + + +class TopicPrompting: + """ + This class allows to formulate prompts and queries against the identified topics to get more information about them + """ + + def __init__(self, + topic_lis: list[Topic], + client, + openai_prompting_model: str = "gpt-3.5-turbo-16k", + max_context_length_promting: int = 16000, + openai_model_temperature_prompting: float = 0.5, + openai_embedding_model: str = "text-embedding-ada-002", + max_context_length_embedding: int = 8191, + basic_model_instruction: str = basic_model_instruction, + corpus_instruction: str = "", + enhancer: TopwordEnhancement = None, + vocab: list = None, + vocab_embeddings: dict = None, + random_state: int = 42): + """ + Initialize the object. + + Args: + topic_list (list[Topic]): List of Topic objects. + client: Client. + openai_prompting_model (str, optional): OpenAI model to use for prompting (default is "gpt-3.5-turbo-16k"). + max_context_length_prompting (int, optional): Maximum context length for the prompting model (default is 16000). + openai_model_temperature_prompting (float, optional): Temperature for the prompting model (default is 0.5). + openai_embedding_model (str, optional): OpenAI model to use for computing embeddings for similarity search (default is "text-embedding-ada-002"). + max_context_length_embedding (int, optional): Maximum context length for the embedding model (default is 8191). + basic_model_instruction (str, optional): Basic instruction for the prompting model. + corpus_instruction (str, optional): Instruction for the prompting model to use the corpus. + enhancer (TopwordEnhancement, optional): TopwordEnhancement object for naming and describing the topics (default is None). + vocab (list, optional): Vocabulary of the corpus (default is None). + vocab_embeddings (dict, optional): Dictionary mapping words to their embeddings (default is None). 
+ random_state (int, optional): Random state for reproducibility (default is 42). + """ + + self.topic_lis = topic_lis + self.client = client + self.openai_prompting_model = openai_prompting_model + self.max_context_length_promting = max_context_length_promting + self.openai_model_temperature_prompting = openai_model_temperature_prompting + self.openai_embedding_model = openai_embedding_model + self.max_context_length_embedding = max_context_length_embedding + self.basic_model_instruction = basic_model_instruction + self.corpus_instruction = f" The following information is available about the corpus used to identify the topics: {corpus_instruction}.\n" + self.enhancer = enhancer + self.vocab = vocab + self.vocab_embeddings = vocab_embeddings + self.random_state = random_state + + + self.function_descriptions = { + "knn_search": { + "name": "knn_search", + "description": "This function is the best choice to find out if a topic is about a specific subject or keyword or aspects or contains information about it. It should also be used to infer the subtopics of a given topic. Note that it is possible that just useless documents are returned.", + "parameters": { + "type": "object", + "properties": { + "topic_index": { + "type": "integer", + "description": "index of the topic to search in." + }, + "query": { + "type": "string", + "description": "query string. Can be a single word or a sentence. Used to create an embedding and search a vector database for the k nearest neighbors." + }, + "k": { + "type": "integer", + "description": "number of neighbors to return. Use more neighbors to get a more diverse and comprehensive set of results." + } + }, + "required": ["topic_index", "query"] + + } + }, + "identify_topic_idx": { + "name": "identify_topic_idx", + "description": "This function can be used to identify the index of the topic that the query is most likely about. This is useful if the topic index is needed for other functions. 
It should NOT be used to find more detailed information on topics. Note that it is possible that the model does not find any topic that fits the query. In this case, the function returns None.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "query string. Can be a single word or a sentence. Used to find the index of the topic that is most likely about the query." + } + }, + "required": ["query"] + + } + }, + "split_topic_kmeans": { + "name": "split_topic_kmeans", + "description": "This function can be used to split a topic into several subtopics using kmeans clustering. Only use this function to actually split topics. The subtopics do not need to be specified and are found automatically via clustering. It returns the topics the original topic has been split into.", + "parameters": { + "type": "object", + "properties": { + "topic_idx": { + "type": "integer", + "description": "index of the topic to split." + }, + "n_clusters": { + "type": "integer", + "description": "number of clusters to split the topic into. The more clusters, the more fine-grained the splitting. Typically 2 clusters are used.", + "default": 2 + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + }, + "required": ["topic_idx"] + } + }, + "split_topic_keywords": { + "name": "split_topic_keywords", + "description": "This function can be used to split a topic into subtopics according to the keywords. I.e. a topic about 'machine learning' can be split into a topic about 'supervised learning' and a topic about 'unsupervised learning'. 
This is achieved by computing the cosine similarity between the keywords and the documents in the topic.", + "parameters": { + "type": "object", + "properties": { + "topic_idx": { + "type": "integer", + "description": "index of the topic to split." + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 2, + "description": "keywords to form new subtopics to replace old topic. Needs to be a list of at least two keywords." + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + }, + "required": ["topic_idx", "keywords"] + } + }, + "split_topic_single_keyword": { + "name": "split_topic_single_keyword", + "description": "This function can be used to split a topic into the main topic and an additional subtopic. I.e. a topic about 'machine learning' can be split into a topic about 'machine learning' and a topic about 'supervised learning.", + "parameters": { + "type": "object", + "properties": { + "topic_idx": { + "type": "integer", + "description": "index of the topic to split." + }, + "keyword": { + "type": "string", + "description": "keyword to form new subtopic besides old main topic. Needs to be a single keyword." + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + }, + "required": ["topic_idx", "keyword"] + } + }, + "combine_topics": { + "name": "combine_topics", + "description": "This function can be used to combine several topics into one topic. 
It returns the newly formed topic and removes the old topics from the list of topics.", + "parameters": { + "type": "object", + "properties": { + "topic_idx_lis": { + "type": "array", + "items": { + "type": "integer" + }, + "minItems": 2, + "description": "list of topic indices to combine." + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + }, + "required": ["topic_idx_lis"] + } + }, + "add_new_topic_keyword": { + "name": "add_new_topic_keyword", + "description": "This function can be used to globally create a new topic based on a keyword. This is useful if the keyword is not contained in any of the topics. The new topic is created by finding the documents that are closest to the keyword and then taking away those documents from the other topics. Note that this method is computationally expensive and should only be used if splitting another topic is unavoidable.", + "parameters": { + "type": "object", + "properties": { + "keyword": { + "type": "string", + "description": "keyword to form new topic. Needs to be a single keyword." + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + + }, + "required": ["keyword"] + } + }, + "delete_topic": { + "name": "delete_topic", + "description": "This function can be used to delete a topic and assign the documents of this topic to the other topics based on centroid similarity. This is useful if the topic is not needed anymore. Note that this method is computationally expensive.", + "parameters": { + "type": "object", + "properties": { + "topic_idx": { + "type": "integer", + "description": "index of the topic to delete." 
+ }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + + }, + "required": ["topic_idx"] + } + }, + "get_topic_information": { + "name": "get_topic_information", + "description": "This function can be used to get information about several topics. This function can be used to COMPARE topics or to get an overview over them. It returns a list of dictionaries containing the topic index and information about the topics.", + "parameters": { + "type": "object", + "properties": { + "topic_idx_lis": { + "type": "array", + "items": { + "type": "integer" + }, + "minItems": 1, + "description": "list of topic indices to get information about." + } + }, + "required": ["topic_idx_lis"] + } + }, + "split_topic_hdbscan": { + "name": "split_topic_hdbscan", + "description": "This function can be used to split a topic into several subtopics using hdbscan clustering. This method should be used if the number of clusters to split the topic into is not known.", + "parameters": { + "type": "object", + "properties": { + "topic_idx": { + "type": "integer", + "description": "index of the topic to split." + }, + "min_cluster_size": { + "type": "integer", + "description": "minimum number of documents in a cluster. The higher the number, the more fine-grained the splitting.", + "default": 10 + }, + "inplace": { + "type": "boolean", + "description": "if True, the topic is split inplace. Otherwise, a new list of topics is created and returned. 
ALWAYS set inplace to False unless something else is explicitly requested!", + "default": False + } + }, + "required": ["topic_idx"] + } + } + } + + self.functionNames2Functions = { + "knn_search": self._knn_search_openai, + "identify_topic_idx": self._identify_topic_idx_openai, + "split_topic_kmeans": self._split_topics_kmeans_openai, + "split_topic_keywords": self._split_topic_keywords_openai, + "split_topic_single_keyword": self._split_topic_single_keyword_openai, + "combine_topics": self._combine_topics_openai, + "add_new_topic_keyword": self._add_new_topic_keyword_openai, + "delete_topic": self._delete_topic_openai, + "get_topic_information": self._get_topic_information_openai, + "split_topic_hdbscan": self._split_topic_hdbscan_openai + } + + def reindex_topics(self) -> None: + """ + Reindexes the topics in self.topic_list to assign correct new indices. + + This method updates the indices of topics within the instance's topic list to ensure they are correctly ordered. + + Returns: + None + """ + + for idx, topic in enumerate(self.topic_lis): + topic.topic_idx = idx + + def reindex_topic_lis(self, topic_list: list[Topic]) -> list[Topic]: + """ + Reindexes the topics in the provided topic list to assign correct new indices. + + This method updates the indices of topics within the given topic list to ensure they are correctly ordered. + + Args: + topic_list (list[Topic]): The list of Topic objects to reindex. + + Returns: + list[Topic]: The reindexed list of Topic objects. + """ + + for idx, topic in enumerate(topic_list): + topic.topic_idx = idx + return topic_list + + def show_topic_lis(self) -> str: + """ + Returns a string representation of the list of topics. + + This method generates a human-readable string representation of the topics in the instance's topic list. + + Returns: + str: A string containing the representation of the list of topics. 
+ """ + + self.reindex_topics() + res = "" + for idx, topic in enumerate(self.topic_lis): + res += str(topic) + + print(res) + + def get_topic_lis(self) -> list[Topic]: + """ + Returns the list of topics stored in the instance. + + This method retrieves and returns the list of topics associated with the instance. + + Returns: + list[Topic]: The list of Topic objects. + """ + + self.reindex_topics() + return self.topic_lis + + def set_topic_lis(self, topic_list: list[Topic]) -> None: + """ + Sets the list of topics for the instance. + + This method updates the list of topics associated with the instance to the provided list. + + Args: + topic_list (list[Topic]): The list of Topic objects to set. + + Returns: + None + """ + + self.topic_lis = topic_list + self.reindex_topics() + + def knn_search(self, topic_index: int, query: str, k: int = 20, doc_cutoff_threshold: int = 1000) -> tuple[list[str], list[int]]: + """ + Finds the k nearest neighbors of the query in the given topic based on cosine similarity in the original embedding space. + + Args: + topic_index (int): Index of the topic to search within. + query (str): Query string. + k (int, optional): Number of neighbors to return (default is 20). + doc_cutoff_threshold (int, optional): Maximum number of tokens per document. Afterwards, the document is cut off (default is 1000). + + Returns: + tuple: A tuple containing two lists - + - A list of top k documents (as strings). + - A list of indices corresponding to the top k documents in the topic. 
+ """ + + topic = self.topic_lis[topic_index] + + query_embedding = self.client.embeddings.create(input = [query], model = self.openai_embedding_model)["data"][0]["embedding"] + + query_similarities = topic.document_embeddings_hd @ query_embedding / (np.linalg.norm(topic.document_embeddings_hd, axis = 1) * np.linalg.norm(query_embedding)) + + topk_doc_indices = np.argsort(query_similarities)[::-1][:k] + topk_docs = [topic.documents[i] for i in topk_doc_indices] + + # cut off documents that are too long + max_number_tokens = self.max_context_length_promting - len(tiktoken.encoding_for_model(self.openai_prompting_model).encode(self.basic_model_instruction + " " + self.corpus_instruction)) - 100 + n_tokens = 0 + for i, doc in enumerate(topk_docs): + encoded_doc = tiktoken.encoding_for_model(self.openai_prompting_model).encode(doc) + n_tokens += len(encoded_doc[:doc_cutoff_threshold]) + if n_tokens > max_number_tokens: + topk_docs = topk_docs[:i] + topk_doc_indices = topk_doc_indices[:i] + break + if len(encoded_doc) > doc_cutoff_threshold: + encoded_doc = encoded_doc[:doc_cutoff_threshold] + topk_docs[i] = tiktoken.encoding_for_model(self.openai_prompting_model).decode(encoded_doc) + + + + + return topk_docs, [int(elem) for elem in topk_doc_indices] + + def prompt_knn_search(self, llm_query: str, topic_index: int = None, n_tries: int = 3) -> tuple[str, tuple[list[str], list[int]]]: + """ + Uses the Language Model (LLM) to answer the llm_query based on the documents belonging to the topic. + + Args: + llm_query (str): Query string for the Language Model (LLM). + topic_index (int, optional): Index of the topic object. If None, the topic is inferred from the query. + n_tries (int, optional): Number of tries to get a valid response from the LLM (default is 3). + + Returns: + tuple: A tuple containing two elements - + - A string representing the answer from the LLM. + - A tuple containing two lists - + - A list of top k documents (as strings). 
+ - A list of indices corresponding to the top k documents in the topic. + """ + + messages = [ + { + "role": "system", + "content": self.basic_model_instruction + " " + self.corpus_instruction + }, + { + "role": "user", + "content": llm_query + } + ] + for _ in range(n_tries): + try: + response_message = self.client.chat.completions.create(model = self.openai_prompting_model, + messages = messages, + functions = [self.function_descriptions["knn_search"]], + function_call = "auto")["choices"][0]["message"] + + # Step 2: check if GPT wanted to call a function + function_call = response_message.get("function_call") + if function_call is not None: + #print("GPT wants to the call the function: ", function_call) + # Step 3: call the function + # Note: the JSON response may not always be valid; be sure to handle errors + + function_name = function_call["name"] + function_to_call = self.functionNames2Functions[function_name] + function_args = json.loads(function_call["arguments"]) + if topic_index is not None: + function_args["topic_index"] = topic_index + function_response = function_to_call(**function_args) + function_response_json = function_response[0] + function_response_return_output = function_response[1] + + + + # Step 4: send the info on the function call and function response to GPT + messages.append(response_message) # extend conversation with assistant's reply + + + messages.append( + { + "role": "function", + "name": function_name, + "content": function_response_json, + } + ) # extend conversation with function response + + #print(messages) + second_response = self.client.chat.completions.create(model=self.openai_prompting_model, + messages=messages) # get a new response from GPT where it can see the function response + except (TypeError, ValueError, openai.APIError, openai.APIConnectionError) as error: + print("Error occured: ", error) + print("Trying again...") + + return second_response, function_response_return_output + + def identify_topic_idx(self, 
query: str, n_tries: int = 3) -> int: + """ + Identifies the index of the topic that the query is most likely about. + + This method uses a Language Model (LLM) to determine which topic best fits the query description. If the LLM does not find any topic that fits the query, None is returned. + + Args: + query (str): Query string. + n_tries (int, optional): Number of tries to get a valid response from the LLM (default is 3). + + Returns: + int: The index of the topic that the query is most likely about. If no suitable topic is found, None is returned. + """ + + + topic_descriptions_str = "" + for i, topic in enumerate(self.topic_lis): + description = topic.topic_description + description = f"""Topic index: {i}: \n {description} \n \n""" + topic_descriptions_str += description + + system_prompt = f"""You are a helpful assistant.""" + + user_prompt = f""" Please find the index of the topic that is about the following query: {query}. + Those are the given topics: '''{topic_descriptions_str}'''. + Please make sure to reply ONLY with an integer number between 0 and {len(self.topic_lis) - 1}! + Reply with -1 if you don't find any topic that fits the query! + Always explicitly say if you don't find any useful information by replying with -1! If in doubt, say that you did not find any useful information! 
+ Reply in the following format: "The topic index is: """ + + messages = [ + { + "role": "system", + "content": system_prompt + }, + { + "role": "user", + "content": user_prompt + } + ] + for _ in range(n_tries): + try: + response_message = self.client.chat.completions.create(model = self.openai_prompting_model, + messages = messages)["choices"][0]["message"] + + except (TypeError, ValueError, openai.APIError, openai.APIConnectionError) as error: + print("Error occured: ", error) + print("Trying again...") + + + + response_text = response_message["content"] + # find integer number in response text + try: + match = re.search(r'(-?\d+)', response_text) + topic_index = int(match.group(1)) + except: # in case the LLM does not find any topic that fits the query, return None + topic_index = None + + + if topic_index is None: + raise ValueError("No integer number found in response text! The model gave the following response: ", response_text) + + if topic_index == -1: + return None + else: + return topic_index + + def split_topic_new_assignments(self, topic_idx: int, new_topic_assignments: np.ndarray, inplace: bool = False) -> list[Topic]: + """ + Splits a topic into new topics based on new topic assignments. + + Note that this method only computes topwords based on the cosine-similarity method because tf-idf topwords need expensive computation on the entire corpus. + The topwords of the old topic are also just split among the new ones. No new topwords are computed in this step. + + Args: + topic_idx (int): Index of the topic to split. + new_topic_assignments (np.ndarray): New topic assignments for the documents in the topic. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the split. 
+ """ + + + if self.vocab_embeddings is None: + raise(ValueError("Need to provide vocab_embeddings to Topic prompting class to split a topic!")) + if self.enhancer is None: + raise(ValueError("Need to provide enhancer to Topic prompting class to split a topic!")) + + vocab_embedding_dict = self.vocab_embeddings + enhancer = self.enhancer + + old_topic = self.topic_lis[topic_idx] + + assert len(new_topic_assignments) == len(old_topic.documents), "new_topic_assignments must have the same length as the number of documents in the topic!" + + # create new topics + new_topics = [] + for i in np.unique(new_topic_assignments): + docs = [old_topic.documents[j] for j in range(len(old_topic.documents)) if new_topic_assignments[j] == i] + docs_embeddings = old_topic.document_embeddings_hd[new_topic_assignments == i] + words_raw = [] + for doc in docs: + words_raw += doc.split(" ") + words_raw = set(words_raw) + words = [word for word in old_topic.words if word in words_raw] + + new_topic = extract_and_describe_topic_cos_sim( + documents_topic = docs, + document_embeddings_topic = docs_embeddings, + words_topic = words, + vocab_embeddings = vocab_embedding_dict, + umap_mapper = old_topic.umap_mapper, + enhancer=enhancer, + n_topwords = 2000 + ) + new_topic.topic_idx = len(self.topic_lis) + i + 1 + new_topics.append(new_topic) + + new_topic_lis = self.topic_lis.copy() + new_topic_lis.pop(topic_idx) + new_topic_lis += new_topics + new_topic_lis = self.reindex_topic_lis(new_topic_lis) + + if inplace: + self.topic_lis = new_topic_lis + + return new_topic_lis + + def split_topic_kmeans(self, topic_idx: int, n_clusters: int = 2, inplace: bool = False) -> list[Topic]: + """ + Splits an existing topic into several subtopics using k-means clustering on the document embeddings of the topic. + + Note that no new topwords are computed in this step, and the topwords of the old topic are just split among the new ones. 
Additionally, only the cosine-similarity method for topwords extraction is used. + + Args: + topic_idx (int): Index of the topic to split. + n_clusters (int, optional): Number of clusters to split the topic into (default is 2). + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the split. + """ + + + old_topic = self.topic_lis[topic_idx] + embeddings = old_topic.document_embeddings_ld # embeddings to split into clusters + + kmeans_res = sklearn.cluster.KMeans(n_clusters = n_clusters, random_state = self.random_state, n_init = "auto").fit(embeddings) + cluster_labels = kmeans_res.labels_ + new_topics = self.split_topic_new_assignments(topic_idx, cluster_labels, inplace) + + return new_topics + + def split_topic_hdbscan(self, topic_idx: int, min_cluster_size: int = 100, inplace: bool = False) -> list[Topic]: + """ + Splits an existing topic into several subtopics using HDBSCAN clustering on the document embeddings of the topic. + + This method does not require specifying the number of clusters to split. Note that no new topwords are computed in this step, and the topwords of the old topic are just split among the new ones. Additionally, only the cosine-similarity method for topwords extraction is used. + + Args: + topic_idx (int): Index of the topic to split. + min_cluster_size (int, optional): Minimum cluster size to split the topic into (default is 100). + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the split. 
+ """ + + + old_topic = self.topic_lis[topic_idx] + embeddings = old_topic.document_embeddings_ld + + clusterer = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size, prediction_data = True) + clusterer.fit(embeddings) + cluster_labels = clusterer.labels_ + new_topics = self.split_topic_new_assignments(topic_idx, cluster_labels, inplace) + + new_topics = self.reindex_topic_lis(new_topics) + + if inplace: + self.topic_lis = new_topics + + return new_topics + + def split_topic_keywords(self, topic_idx: int, keywords: str, inplace: bool = False) -> list[Topic]: + """ + Splits the topic into subtopics according to the provided keywords. + + This is achieved by computing the cosine similarity between the keywords and the documents in the topic. Note that no new topwords are computed in this step, and the topwords of the old topic are just split among the new ones. Additionally, only the cosine-similarity method for topwords extraction is used. + + Args: + topic_idx (int): Index of the topic to split. + keywords (str): Keywords to split the topic into. Needs to be a list of at least two keywords. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the split. + """ + + assert len(keywords) > 1, "Need at least two keywords to split the topic! Otherwise use the split_topic_single_keyword function!" 
+ keyword_embeddings = [] + for keyword in keywords: + keyword_embeddings.append(self.client.embeddings.create(input = [keyword], model = self.openai_embedding_model)["data"][0]["embedding"]) + keyword_embeddings = np.array(keyword_embeddings) + + old_topic = self.topic_lis[topic_idx] + document_embeddings = old_topic.document_embeddings_hd + + document_embeddings = document_embeddings / np.linalg.norm(document_embeddings, axis = 1)[:, np.newaxis] + keyword_embeddings = keyword_embeddings / np.linalg.norm(keyword_embeddings, axis = 1)[:, np.newaxis] + similarities = document_embeddings @ keyword_embeddings.T + new_topic_assignments = np.argmax(similarities, axis = 1) + + # if the topic cannot be split, i.e. all documents are assigned the same label, raise an error + if len(np.unique(new_topic_assignments)) == 1: + raise ValueError(f"The topic cannot be split into the subtopics {keywords}. All documents are assigned the same label!") + + new_topics = self.split_topic_new_assignments(topic_idx, new_topic_assignments, inplace = inplace) + + new_topics = self.reindex_topic_lis(new_topics) + + if inplace: + self.topic_lis = new_topics + + return new_topics + + def split_topic_single_keyword(self, topic_idx: int, keyword: str, inplace: bool = False) -> list[Topic]: + """ + Splits the topic with a single keyword. + + This method splits the topic such that all documents closer to the original topic name stay in the old topic, while all documents closer to the keyword are moved to the new topic. Note that no new topwords are computed in this step, and the topwords of the old topic are just split among the new ones. Additionally, only the cosine-similarity method for topwords extraction is used. + + Args: + topic_idx (int): Index of the topic to split. + keyword (str): Keyword to split the topic into. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). 
+ + Returns: + list of Topic: A list of new topics resulting from the split. + """ + + keywords = [self.topic_lis[topic_idx].topic_name, keyword] + + res = self.split_topic_keywords(topic_idx, keywords, inplace) + + return res + + def combine_topics(self, topic_idx_lis: list[int], inplace: bool = False) -> list[Topic]: + """ + Combines several topics into one topic. + + This method combines the specified topics into a single topic. Note that no new topwords are computed in this step, and the topwords of the old topics are just combined. Additionally, only the cosine-similarity method for topwords extraction is used. + + Args: + topic_idx_list (list[int]): List of topic indices to combine. + inplace (bool, optional): If True, the topics are combined in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the combination. + """ + + new_topic_docs = [] + new_topic_words = [] + new_topic_document_embeddings_hd = [] + + for topic_idx in topic_idx_lis: + topic = self.topic_lis[topic_idx] + new_topic_docs += topic.documents + new_topic_words += topic.words + new_topic_document_embeddings_hd.append(topic.document_embeddings_hd) + + new_topic_document_embeddings_hd = np.concatenate(new_topic_document_embeddings_hd, axis = 0) + + new_topic = extract_and_describe_topic_cos_sim( + documents_topic = new_topic_docs, + document_embeddings_topic = new_topic_document_embeddings_hd, + words_topic = new_topic_words, + vocab_embeddings = self.vocab_embeddings, + umap_mapper = self.topic_lis[0].umap_mapper, + enhancer=self.enhancer, + n_topwords = 2000 + ) + + new_topic.topic_idx = len(self.topic_lis) + 1 + new_topic_lis = self.topic_lis.copy() + + for topic_idx in sorted(topic_idx_lis, reverse = True): + new_topic_lis.pop(topic_idx) + new_topic_lis.append(new_topic) + new_topic_lis = self.reindex_topic_lis(new_topic_lis) + + + if inplace: + self.topic_lis = new_topic_lis + 
self.reindex_topics() + + return new_topic_lis + + def add_new_topic_keyword(self, keyword: str, inplace: bool = False, rename_new_topic: bool = False) -> list[Topic]: + """ + Create a new topic based on a keyword and recompute topic topwords. + + This method removes all documents belonging to other topics from them and adds them to the new topic. It computes new topwords using both the tf-idf and the cosine-similarity method. + + Args: + keyword (str): Keyword to create the new topic from. + inplace (bool, optional): If True, the topic is updated in place. Otherwise, a new list of topics is created and returned (default is False). + rename_new_topic (bool, optional): If True, the new topic is renamed to the keyword (default is False). + + Returns: + list of Topic: A list of new topics, including the newly created topic and the modified old ones. + """ + + umap_mapper = self.topic_lis[0].umap_mapper + + keyword_embedding_hd = self.client.embeddings.create(input = [keyword], model = self.openai_embedding_model)["data"][0]["embedding"] + keyword_embedding_hd = np.array(keyword_embedding_hd).reshape(1, -1) + keyword_embedding_ld = umap_mapper.transform(keyword_embedding_hd)[0] + + old_centroids_ld = [] + for topic in self.topic_lis: + old_centroids_ld.append(topic.centroid_ld) + old_centroids_ld = np.array(old_centroids_ld) + + # assign documents to new centroid (keyword_embedding_ld) iff they are closer to the new centroid than to their old centroid + + new_doc_topic_assignments = [] + doc_lis = [] + + new_topic_idx = len(self.topic_lis) + for i, topic in enumerate(self.topic_lis): + doc_lis += topic.documents + document_embeddings = topic.document_embeddings_ld + cos_sim_old_centroid = document_embeddings @ old_centroids_ld[i] / (np.linalg.norm(document_embeddings, axis = 1) * np.linalg.norm(old_centroids_ld[i])) + cos_sim_new_centroid = document_embeddings @ keyword_embedding_ld / (np.linalg.norm(document_embeddings, axis = 1) * 
np.linalg.norm(keyword_embedding_ld)) + new_centroid_is_closer = cos_sim_new_centroid > cos_sim_old_centroid + + new_document_assignments = np.where(new_centroid_is_closer, new_topic_idx, i) + new_doc_topic_assignments.append(new_document_assignments) + + new_doc_topic_assignments = np.concatenate(new_doc_topic_assignments, axis = 0) + + assert len(doc_lis) == len(new_doc_topic_assignments), "Number of documents must be equal to the number of document assignments!" + + new_embeddings_hd = [] + new_embeddings_ld = [] + + for topic in self.topic_lis: + new_embeddings_hd.append(topic.document_embeddings_hd) + new_embeddings_ld.append(topic.document_embeddings_ld) + + new_embeddings_hd = np.concatenate(new_embeddings_hd, axis = 0) + new_embeddings_ld = np.concatenate(new_embeddings_ld, axis = 0) + + new_topics = extract_describe_topics_labels_vocab( + corpus = doc_lis, + document_embeddings_hd = new_embeddings_hd, + document_embeddings_ld = new_embeddings_ld, + labels = new_doc_topic_assignments, + vocab = self.vocab, + umap_mapper = umap_mapper, + vocab_embeddings = self.vocab_embeddings, + enhancer = self.enhancer + ) + + if rename_new_topic: + new_topics[-1].topic_name = keyword + + new_topics = self.reindex_topic_lis(new_topics) + + if inplace: + self.topic_lis = new_topics + + return new_topics + + def delete_topic(self, topic_idx: int, inplace: bool = False) -> list[Topic]: + """ + Deletes a topic with the given index from the list of topics and recomputes topwords and representations of the remaining topics. + + This method assigns the documents of the deleted topic to the remaining topics. + + Args: + topic_idx (int): Index of the topic to delete. + inplace (bool, optional): If True, the topic is deleted in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + list of Topic: A list of new topics resulting from the deletion. 
+ """ + + + topic_lis_new = deepcopy(self.topic_lis) + topic_lis_new.pop(topic_idx) + + old_centroids_ld = [] + for topic in topic_lis_new: + old_centroids_ld.append(topic.centroid_ld) + + old_centroids_ld = np.array(old_centroids_ld) + + document_embeddings_ld = [] + + for topic in self.topic_lis: + document_embeddings_ld.append(topic.document_embeddings_ld) + + document_embeddings_ld = np.concatenate(document_embeddings_ld, axis = 0) # has shape (n_documents, n_topics) + + centroid_similarities = document_embeddings_ld @ old_centroids_ld.T / (np.linalg.norm(document_embeddings_ld, axis = 1)[:, np.newaxis] * np.linalg.norm(old_centroids_ld, axis = 1)) + new_topic_assignments = np.argmax(centroid_similarities, axis = 1) + + new_embeddings_hd = [] + new_embeddings_ld = [] + + for topic in self.topic_lis: + new_embeddings_hd.append(topic.document_embeddings_hd) + new_embeddings_ld.append(topic.document_embeddings_ld) + + new_embeddings_hd = np.concatenate(new_embeddings_hd, axis = 0) + new_embeddings_ld = np.concatenate(new_embeddings_ld, axis = 0) + + doc_lis = [] + for topic in self.topic_lis: + doc_lis += topic.documents + + + + new_topics = extract_describe_topics_labels_vocab( + corpus = doc_lis, + document_embeddings_hd = new_embeddings_hd, + document_embeddings_ld = new_embeddings_ld, + labels = new_topic_assignments, + vocab = self.vocab, + umap_mapper = self.topic_lis[0].umap_mapper, + vocab_embeddings = self.vocab_embeddings, + enhancer = self.enhancer + ) + + new_topics = self.reindex_topic_lis(new_topics) + + if inplace: + self.topic_lis = new_topics + + return new_topics + + def get_topic_information(self, topic_idx_lis: list[int], max_number_topwords: int = 500) -> dict: + """ + Get detailed information on topics by their indices. + + This function returns a dictionary where the keys are the topic indices, and the values are strings describing the topics. The description includes a maximum of max_number_topwords topwords. 
+ + Args: + topic_idx_list (list[int]): List of topic indices to compare. + max_number_topwords (int, optional): Maximum number of topwords to include in the description of the topics (default is 500). + + Returns: + dict: A dictionary with topic indices as keys and their descriptions as values. + """ + + max_number_tokens = self.max_context_length_promting - len(tiktoken.encoding_for_model(self.openai_prompting_model).encode(self.basic_model_instruction + " " + self.corpus_instruction)) - 100 + + topic_info = {} # dictionary with the topic indices as keys and the topic descriptions as values + + for topic_idx in topic_idx_lis: + topic = self.topic_lis[topic_idx] + topic_info[topic_idx] = topic.topic_description + + topic_str = f""" + Topic index: {topic_idx} + Topic name: {topic.topic_name} + Topic description: {topic.topic_description} + Topic topwords: {topic.top_words["cosine_similarity"][:max_number_topwords]}""" + + topic_info[topic_idx] = topic_str + + # prune all topic descriptions to the maximum number of tokens by taking away the last word until the description fits + + max_number_tokens_per_topic = max_number_tokens // len(topic_idx_lis) + tiktoken_encodings = {idx: tiktoken.encoding_for_model(self.openai_prompting_model).encode(topic_info[idx]) for idx in topic_idx_lis} + pruned_encodings = {idx: tiktoken_encodings[idx][:max_number_tokens_per_topic] for idx in topic_idx_lis} + + topic_info = {idx: tiktoken.encoding_for_model(self.openai_prompting_model).decode(pruned_encodings[idx]) for idx in topic_idx_lis} + + return topic_info + + def _knn_search_openai(self, topic_index: int, query: str, k: int = 20) -> tuple[str, (list[str], list[int])]: + """ + A version of the knn_search function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_index (int): Index of the topic to search in. + query (str): Query string. + k (int, optional): Number of neighbors to return (default is 20). 
+ + Returns: + json: JSON object to be used with the OpenAI API. + tuple: A tuple containing two lists - + - A list of top k documents (as strings). + - A list of indices corresponding to the top k documents in the topic. + """ + + topk_docs, topk_doc_indices = self.knn_search(topic_index, query, k) + json_obj = json.dumps({ + "top-k documents": topk_docs, + "indices of top-k documents": list(topk_doc_indices) + }) + return json_obj, (topk_docs, topk_doc_indices) + + def _identify_topic_idx_openai(self, query: str, n_tries: int = 3) -> tuple[str, int]: + """ + A version of the identify_topic_idx function that returns a JSON file to be used with the OpenAI API. + + Args: + query (str): Query string. + n_tries (int, optional): Number of tries to get a valid response from the LLM (default is 3). + + Returns: + json: JSON object to be used with the OpenAI API. + int: The topic index. + """ + + topic_index = self.identify_topic_idx(query, n_tries) + json_obj = json.dumps({ + "topic index": topic_index + }) + return json_obj, topic_index + + def _split_topic_hdbscan_openai(self, topic_idx: int, min_cluster_size: int = 10, inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the split_topic_hdbscan function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx (int): Index of the topic to split. + min_cluster_size (int, optional): Minimum cluster size to split the topic into (default is 10). + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the split. 
+ """ + + new_topics = self.split_topic_hdbscan(topic_idx, min_cluster_size, inplace) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-len(new_topics):] + }) + return json_obj, new_topics + + def _split_topics_kmeans_openai(self, topic_idx: list[int], n_clusters: int = 2, inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the split_topic_kmeans function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx (list[int]): List of indices of the topics to split. + n_clusters (int, optional): Number of clusters to split each topic into (default is 2). + inplace (bool, optional): If True, the topics are split in place. Otherwise, new lists of topics are created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the split. + """ + + new_topics = self.split_topic_kmeans(topic_idx, n_clusters, inplace) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-n_clusters:] + }) + return json_obj, new_topics + + def _split_topic_keywords_openai(self, topic_idx: int, keywords: str, inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the split_topic_keywords function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx (int): Index of the topic to split. + keywords (str): Keywords to split the topic into. Needs to be a list of at least two keywords. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the split. 
+ """ + + new_topics = self.split_topic_keywords(topic_idx, keywords, inplace) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-len(keywords):] + }) + return json_obj, new_topics + + def _split_topic_single_keyword_openai(self, topic_idx: int, keyword: str, inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the split_topic_single_keyword function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx (int): Index of the topic to split. + keyword (str): Keyword to split the topic into. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the split. + """ + + new_topics = self.split_topic_single_keyword(topic_idx, keyword, inplace) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-2:] + }) + return json_obj, new_topics + + def _combine_topics_openai(self, topic_idx_lis: list[int], inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the combine_topics function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx_lis (list[int]): List of topic indices to combine. + inplace (bool, optional): If True, the topics are combined in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the combination. 
+ """ + + new_topics = self.combine_topics(topic_idx_lis, inplace) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-1] + }) + return json_obj, new_topics + + def _add_new_topic_keyword_openai(self, keyword: str, inplace: bool = False, rename_new_topic: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the add_new_topic_keyword function that returns a JSON file to be used with the OpenAI API. + + Args: + keyword (str): Keyword to create the new topic from. + inplace (bool, optional): If True, the topic is split in place. Otherwise, a new list of topics is created and returned (default is False). + rename_new_topic (bool, optional): If True, the new topic is renamed to the keyword (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of new topics resulting from the operation. + """ + + new_topics = self.add_new_topic_keyword(keyword, inplace, rename_new_topic) + json_obj = json.dumps({ + "new topics": [topic.to_dict() for topic in new_topics][-1] + }) + return json_obj, new_topics + + def _delete_topic_openai(self, topic_idx: int, inplace: bool = False) -> tuple[str, list[Topic]]: + """ + A version of the delete_topic function that returns a JSON file to be used with the OpenAI API. + + Args: + topic_idx (int): Index of the topic to delete. + inplace (bool, optional): If True, the topic is deleted in place. Otherwise, a new list of topics is created and returned (default is False). + + Returns: + json: JSON object to be used with the OpenAI API. + list of Topic: A list of topics after the deletion operation. 
+ """ + + new_topics = self.delete_topic(topic_idx, inplace) + json_obj = json.dumps({ + f"Topics after deleting the one with index {topic_idx}": [topic.to_dict() for topic in new_topics] + }) + return json_obj, new_topics + + def _get_topic_information_openai(self, topic_idx_lis: list[int]) -> tuple[str, dict]: + """ + A version of the get_topic_information function that returns a JSON file suitable for use with the OpenAI API. + + Args: + topic_idx_lis (list[int]): List of topic indices to compare. + + Returns: + json: JSON object to be used with the OpenAI API. + dict: A dictionary containing detailed information about the specified topics. + """ + + topic_info = self.get_topic_information(topic_idx_lis) + json_obj = json.dumps({ + "topic info": topic_info + }) + return json_obj, topic_info + + def _fix_dictionary_topwords(self): + """ + Fix an issue with the topic representation where the topwords are nested within another dictionary in the actual dictionary defining them. + """ + + for topic in self.topic_lis: + if type(topic.top_words["cosine_similarity"]) == dict: + topic.top_words["cosine_similarity"] = topic.top_words["cosine_similarity"][0] + + def general_prompt(self, prompt: str, n_tries: int = 2) -> tuple[list[str], object]: + """ + Prompt the Language Model (LLM) with a general prompt and return the response. Allow the LLM to call any function defined in the class. + + Use n_tries in case the LLM does not provide a valid response. + + Args: + prompt (str): Prompt string. + n_tries (int, optional): Number of tries to get a valid response from the LLM (default is 2). + + Returns: + list of str: Response messages from the LLM. + object: Response of the invoked function. 
+ """ + + messages = [ + { + "role": "system", + "content": self.basic_model_instruction + " " + self.corpus_instruction + }, + { + "role": "user", + "content": prompt + } + ] + + functions = [self.function_descriptions[key] for key in self.function_descriptions.keys()] + for _ in range(n_tries): + try: + response_message = self.client.chat.completions.create(model = self.openai_prompting_model, + messages = messages, + functions = functions, + function_call = "auto").choices[0].message + + # Step 2: check if GPT wanted to call a function + function_call = response_message.function_call + if function_call is not None: + print("GPT wants to the call the function: ", function_call) + # Step 3: call the function + # Note: the JSON response may not always be valid; be sure to handle errors + + function_name = function_call.name + function_to_call = self.functionNames2Functions[function_name] + function_args = json.loads(function_call.arguments) + function_response = function_to_call(**function_args) + function_response_json = function_response[0] + function_response_return_output = function_response[1] + + # Step 4: send the info on the function call and function response to GPT + messages.append(response_message) # extend conversation with assistant's reply + + messages.append( + { + "role": "function", + "name": function_name, + "content": function_response_json, + } + ) # extend conversation with function response + + second_response = self.client.chat.completions.create(model=self.openai_prompting_model, + messages=messages) # get a new response from GPT where it can see the function response + except (TypeError, ValueError, openai.APIError, openai.APIConnectionError) as error: + print("Error occured: ", error) + print("Trying again...") + + return [response_message, second_response], function_response_return_output \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/src/topicgpt/TopicRepresentation.py 
b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicRepresentation.py new file mode 100644 index 0000000..89b152a --- /dev/null +++ b/LLMTopicDetection_TopicGPT/src/topicgpt/TopicRepresentation.py @@ -0,0 +1,664 @@ +import numpy as np +import umap +import sys +import os +import inspect +from tqdm import tqdm +import umap +import json + +# make sure the import works even if the package has not been installed and just the files are used + +from topicgpt.Clustering import Clustering_and_DimRed +from topicgpt.ExtractTopWords import ExtractTopWords +from topicgpt.TopwordEnhancement import TopwordEnhancement + +class Topic: + """ + class to represent a topic and all its attributes + """ + + def __init__(self, + topic_idx: str, + documents: list[str], + words: dict[str, int], + centroid_hd: np.ndarray = None, + centroid_ld: np.ndarray = None, + document_embeddings_hd: np.ndarray = None, + document_embeddings_ld: np.ndarray = None, + document_embedding_similarity: np.ndarray = None, + umap_mapper: umap.UMAP = None, + top_words: dict[str, list[str]] = None, + top_word_scores: dict[str, list[float]] = None + ) -> None: + """ + Represents a topic and all its attributes. + + Args: + topic_idx (str): Index or name of the topic. + documents (list[str]): List of documents in the topic. + words (dict[str, int]): Dictionary of words and their counts in the topic. + centroid_hd (np.ndarray, optional): Centroid of the topic in high-dimensional space. + centroid_ld (np.ndarray, optional): Centroid of the topic in low-dimensional space. + document_embeddings_hd (np.ndarray, optional): Embeddings of documents in high-dimensional space that belong to this topic. + document_embeddings_ld (np.ndarray, optional): Embeddings of documents in low-dimensional space that belong to this topic. + document_embedding_similarity (np.ndarray, optional): Similarity array of document embeddings to the centroid in low-dimensional space. 
+ umap_mapper (umap.UMAP, optional): UMAP mapper object to map from high-dimensional space to low-dimensional space. + top_words (dict[str, list[str]], optional): Dictionary of top words in the topic according to different metrics. + top_word_scores (dict[str, list[float]], optional): Dictionary of how representative the top words are according to different metrics. + """ + + # do some checks on the input + + assert len(documents) == len(document_embeddings_hd) == len(document_embeddings_ld) == len(document_embedding_similarity), "documents, document_embeddings_hd, document_embeddings_ld and document_embedding_similarity must have the same length" + assert len(documents) > 0, "documents must not be empty" + assert len(words) > 0, "words must not be empty" + + + self.topic_idx = topic_idx + self.documents = documents + self.words = words + self.centroid_hd = centroid_hd + self.centroid_ld = centroid_ld + self.document_embeddings_hd = document_embeddings_hd + self.document_embeddings_ld = document_embeddings_ld + self.document_embedding_similarity = document_embedding_similarity + self.umap_mapper = umap_mapper + self.top_words = top_words + self.top_word_scores = top_word_scores + + self.topic_name = None # initialize the name of the topic as none + + def __str__(self) -> str: + + if self.topic_idx and self.topic_name is None: + repr = f"Topic {hash(self)}\n" + if self.topic_name is None: + repr = f"Topic: {self.topic_idx}\n" + else: + repr = f"Topic {self.topic_idx}: {self.topic_name}\n" + + return repr + + def __repr__(self) -> str: + return self.__str__() + + def to_json(self) -> str: + """ + return a json representation of the topic + """ + repr_dict = { + "topic_idx": self.topic_idx, + "topic_name": self.topic_name, + "topic_description": self.topic_description + } + + json_object = json.dumps(repr_dict, indent = 4) + return json_object + + def to_dict(self) -> dict: + """ + return a dict representation of the topic + """ + repr_dict = { + "topic_idx": 
int(self.topic_idx), + "topic_name": self.topic_name, + "topic_description": self.topic_description + } + return repr_dict + + def set_topic_name(self, name:str): + """ + add a name to the topic + params: + name: name of the topic + """ + self.topic_name = name + + def set_topic_description(self, text: str): + """ + add a text description to the topic + params: + text: text description of the topic + """ + self.topic_description = text + +def topic_to_json(topic: Topic) -> str: + """ + Return a JSON representation of the topic. + + Args: + topic (Topic): The topic object to convert to JSON. + + Returns: + str: A JSON string representing the topic. + """ + repr_dict = { + "topic_idx": topic.topic_idx, + "topic_name": topic.topic_name, + "topic_description": topic.topic_description + } + + json_object = json.dumps(repr_dict, indent = 4) + return json_object + +def topic_lis_to_json(topics: list[Topic]) -> str: + """ + Return a JSON representation of a list of topics. + + Args: + topics (list[Topic]): The list of topic objects to convert to JSON. + + Returns: + str: A JSON string representing the list of topics. + """ + repr_dict = {} + for topic in topics: + repr_dict[topic.topic_idx] = { + "topic_name": topic.topic_name, + "topic_description": topic.topic_description + } + + json_object = json.dumps(repr_dict, indent = 4) + return json_object + +@staticmethod +def extract_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}) -> list[Topic]: + """ + Extracts topics from the given corpus using the provided clusterer object on the document embeddings. + + Args: + corpus (list[str]): List of documents. + document_embeddings (np.ndarray): Embeddings of the documents. 
+ clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents. + vocab_embeddings (np.ndarray): Embeddings of the vocabulary. + n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000). + topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. + Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]). + compute_vocab_hyperparams (dict, optional): Hyperparameters for the top-word extraction methods. + + Returns: + list[Topic]: List of Topic objects representing the extracted topics. + """ + + for elem in topword_extraction_methods: + if elem not in ["tfidf", "cosine_similarity"]: + raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'") + if topword_extraction_methods == []: + raise ValueError("topword_extraction_methods cannot be empty") + + dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings) # get dimensionality reduced embeddings, their labels and the umap mapper object + + unique_labels = np.unique(labels) # In case the cluster labels are not consecutive numbers, we need to map them to consecutive + label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])} + label_mapping[-1] = -1 + labels = np.array([label_mapping[label] for label in labels]) + + extractor = ExtractTopWords() + centroid_dict = extractor.extract_centroids(document_embeddings, labels) # get the centroids of the clusters + centroid_arr = np.array(list(centroid_dict.values())) + if centroid_arr.ndim == 1: + centroid_arr = centroid_arr.reshape(-1, 1) + dim_red_centroids = umap_mapper.transform(np.array(list(centroid_dict.values()))) # map the centroids to low dimensional space + + dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)} + + vocab = 
extractor.compute_corpus_vocab(corpus, **compute_vocab_hyperparams) # compute the vocabulary of the corpus + + word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False) # compute the word-topic matrix of the corpus + if "tfidf" in topword_extraction_methods: + tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords) # extract the top-words according to tfidf + if "cosine_similarity" in topword_extraction_methods: + cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False) + + topics = [] + for i, label in enumerate(np.unique(labels)): + if label < -0.5: # dont include outliers + continue + topic_idx = f"{label}" + documents = [doc for j, doc in enumerate(corpus) if labels[j] == label] + embeddings_hd = document_embeddings[labels == label] + embeddings_ld = dim_red_embeddings[labels == label] + centroid_hd = centroid_dict[label] + centroid_ld = dim_red_centroids[label] + + centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld)) + similarity_sorting = np.argsort(centroid_similarity)[::-1] + documents = [documents[i] for i in similarity_sorting] + embeddings_hd = embeddings_hd[similarity_sorting] + embeddings_ld = embeddings_ld[similarity_sorting] + + if type(cosine_topwords[label]) == dict: + cosine_topwords[label] = cosine_topwords[label][0] + + top_words = { + "tfidf": tfidf_topwords[label] if "tfidf" in topword_extraction_methods else None, + "cosine_similarity": cosine_topwords[label] if "cosine_similarity" in topword_extraction_methods else None + } + top_word_scores 
# NOTE: this is a module-level function, so it is deliberately NOT decorated with
# @staticmethod; a bare staticmethod object is only callable from Python 3.10 on.
def extract_topics_no_new_vocab_computation(
    corpus: list[str],
    vocab: list[str],
    document_embeddings: np.ndarray,
    clusterer: Clustering_and_DimRed,
    vocab_embeddings: np.ndarray,
    n_topwords: int = 2000,
    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
    consider_outliers: bool = False,
) -> list[Topic]:
    """
    Extract topics from the corpus by clustering the document embeddings, re-using a given vocabulary.

    Unlike the variant that computes the vocabulary itself, this function uses ``vocab`` as provided.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): Vocabulary of the corpus.
        document_embeddings (np.ndarray): Embeddings of the documents (one row per document).
        clusterer (Clustering_and_DimRed): Object whose ``cluster_and_reduce`` returns the reduced embeddings,
            the cluster labels and the fitted UMAP mapper.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary (forwarded as ``vocab_embedding_dict``).
        n_topwords (int, optional): Number of top-words to extract per topic (default is 2000).
        topword_extraction_methods (list[str], optional): Methods used to extract top-words.
            Can contain "tfidf" and "cosine_similarity" (default is both). The default list is never mutated.
        consider_outliers (bool, optional): Forwarded to the word-topic matrix computation (default is False).

    Returns:
        list[Topic]: One Topic per non-outlier cluster, documents sorted by decreasing centroid similarity.

    Raises:
        ValueError: If ``topword_extraction_methods`` is empty or contains an unknown method.
    """
    allowed_methods = ("tfidf", "cosine_similarity")
    for method in topword_extraction_methods:
        if method not in allowed_methods:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if len(topword_extraction_methods) == 0:
        raise ValueError("topword_extraction_methods cannot be empty")

    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    # dimensionality-reduced embeddings, their cluster labels and the umap mapper object
    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)

    # The cluster labels may not be consecutive: map them to 0..k-1 and keep -1 for outliers.
    unique_labels = np.unique(labels)
    label_mapping = {label: new for new, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # centroids of the clusters

    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    # Transform the guarded array (the original built it but then transformed a fresh copy).
    dim_red_centroids = umap_mapper.transform(centroid_arr)
    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = consider_outliers)

    tfidf_topwords = tfidf_dict = None
    cosine_topwords = cosine_dict = None
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(
            word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)
    if use_cosine:
        # NOTE(review): consider_outliers is hard-coded to True here, independent of the
        # consider_outliers argument; kept exactly as in the original -- confirm intent.
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(
            word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings,
            centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords,
            reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = True)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # don't include outliers
            continue

        in_topic = labels == label
        documents = [doc for doc, keep in zip(corpus, in_topic) if keep]
        embeddings_hd = document_embeddings[in_topic]
        embeddings_ld = dim_red_embeddings[in_topic]
        centroid_hd = centroid_dict[label]
        # Look the reduced centroid up by label; indexing the array by position is only
        # correct when the centroid dict happens to be keyed 0..k-1 in order.
        centroid_ld = dim_red_centroid_dict[label]

        # cosine similarity of every document to the topic centroid (low-dimensional space)
        centroid_similarity = np.dot(embeddings_ld, centroid_ld) / (
            np.linalg.norm(embeddings_ld, axis = 1) * np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[pos] for pos in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # NOTE(review): centroid_similarity itself is passed on in the ORIGINAL document order,
        # while documents/embeddings are sorted -- kept as in the original, confirm intent.

        if use_cosine:
            entry = cosine_topwords[label]
            # Some extractors appear to wrap the word list as {0: words}; unwrap it if so.
            if isinstance(entry, dict) and 0 in entry:
                cosine_topwords[label] = entry[0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topics.append(Topic(topic_idx = f"{label}",
                            documents = documents,
                            words = vocab,
                            centroid_hd = centroid_hd,
                            centroid_ld = centroid_ld,
                            document_embeddings_hd = embeddings_hd,
                            document_embeddings_ld = embeddings_ld,
                            document_embedding_similarity = centroid_similarity,
                            umap_mapper = umap_mapper,
                            top_words = top_words,
                            top_word_scores = top_word_scores))

    return topics
vocab_embeddings: np.ndarray, enhancer: TopwordEnhancement, n_topwords: int = 2000, n_topwords_description: int = 500, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}, topword_description_method: str = "cosine_similarity") -> list[Topic]: + """ + Extracts topics from the given corpus using the provided clusterer object on the document embeddings and describes/names them using the given enhancer object. + + Args: + corpus (list[str]): List of documents. + document_embeddings (np.ndarray): Embeddings of the documents. + clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents. + vocab_embeddings (np.ndarray): Embeddings of the vocabulary. + enhancer (TopwordEnhancement): Enhancer object for enhancing top-words and generating descriptions/names for topics. + n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000). + n_topwords_description (int, optional): Number of top-words to use from the extracted topics for description and naming (default is 500). + topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. + Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]). + compute_vocab_hyperparams (dict, optional): Hyperparameters for the top-word extraction methods. + topword_description_method (str, optional): Method to use for top-word extraction for description/naming. + Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity"). + + Returns: + list[Topic]: List of Topic objects representing the extracted and described topics. 
# NOTE: this is a module-level function, so it is deliberately NOT decorated with
# @staticmethod; a bare staticmethod object is only callable from Python 3.10 on.
def extract_topics_labels_vocab(
    corpus: list[str],
    document_embeddings_hd: np.ndarray,
    document_embeddings_ld: np.ndarray,
    labels: np.ndarray,
    umap_mapper: umap.UMAP,
    vocab_embeddings: np.ndarray,
    vocab: list[str] = None,
    n_topwords: int = 2000,
    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
) -> list[Topic]:
    """
    Extract topics from the corpus using already-known topic labels (no -1 for outliers expected).

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Topic label of every document; must support ``labels == label`` masking.
        umap_mapper (umap.UMAP): Fitted mapper used to project the centroids to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary (forwarded as ``vocab_embedding_dict``).
        vocab (list[str], optional): Vocabulary of the corpus. If None it is computed from ``corpus``.
        n_topwords (int, optional): Number of top-words to extract per topic (default is 2000).
        topword_extraction_methods (list[str], optional): Methods used to extract top-words.
            Can contain "tfidf" and "cosine_similarity" (default is both). The default list is never mutated.

    Returns:
        list[Topic]: One Topic per label >= 0, documents sorted by decreasing centroid similarity.

    Raises:
        ValueError: If ``topword_extraction_methods`` is empty or contains an unknown method.
    """
    allowed_methods = ("tfidf", "cosine_similarity")
    for method in topword_extraction_methods:
        if method not in allowed_methods:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if len(topword_extraction_methods) == 0:
        raise ValueError("topword_extraction_methods cannot be empty")

    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    if vocab is None:
        # a separate extractor instance is used for the vocabulary, as in the original code
        vocab = ExtractTopWords().compute_corpus_vocab(corpus)

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings_hd, labels)  # centroids of the clusters

    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    # Transform the guarded array (the original built it but then transformed a fresh copy).
    dim_red_centroids = umap_mapper.transform(centroid_arr)
    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)

    tfidf_topwords = tfidf_dict = None
    cosine_topwords = cosine_dict = None
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(
            word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(
            word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings,
            centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords,
            reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # don't include outliers
            continue

        in_topic = labels == label
        documents = [doc for doc, keep in zip(corpus, in_topic) if keep]
        embeddings_hd = document_embeddings_hd[in_topic]
        embeddings_ld = document_embeddings_ld[in_topic]
        centroid_hd = centroid_dict[label]
        # Look the reduced centroid up by label; indexing the array by position is only
        # correct when the centroid dict happens to be keyed 0..k-1 in order.
        centroid_ld = dim_red_centroid_dict[label]

        # cosine similarity of every document to the topic centroid (low-dimensional space)
        centroid_similarity = np.dot(embeddings_ld, centroid_ld) / (
            np.linalg.norm(embeddings_ld, axis = 1) * np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[pos] for pos in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # NOTE(review): centroid_similarity itself is passed on in the ORIGINAL document order,
        # while documents/embeddings are sorted -- kept as in the original, confirm intent.

        # Only touch cosine_topwords when that method was requested; the original accessed it
        # unconditionally and raised NameError for topword_extraction_methods == ["tfidf"].
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topics.append(Topic(topic_idx = f"{label}",
                            documents = documents,
                            words = vocab,
                            centroid_hd = centroid_hd,
                            centroid_ld = centroid_ld,
                            document_embeddings_hd = embeddings_hd,
                            document_embeddings_ld = embeddings_ld,
                            document_embedding_similarity = centroid_similarity,
                            umap_mapper = umap_mapper,
                            top_words = top_words,
                            top_word_scores = top_word_scores))

    return topics
+ Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers). Vocabulary is already computed. + Describe and name the topics with the given enhancer object. + + Args: + corpus (list[str]): List of documents. + document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space. + document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space. + labels (np.ndarray): Labels indicating the topics. + umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space. + vocab_embeddings (np.ndarray): Embeddings of the vocabulary. + enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description. + vocab (list[str], optional): Vocabulary of the corpus (default is None). + n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000). + n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500). + topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. + Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]). + topword_description_method (str, optional): Method to use for top-word extraction. Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity"). + + Returns: + list[Topic]: List of Topic objects representing the extracted topics. 
+ """ + + topics = extract_topics_labels_vocab(corpus, document_embeddings_hd, document_embeddings_ld, labels, umap_mapper, vocab_embeddings, vocab, n_topwords, topword_extraction_methods) + topics = describe_and_name_topics(topics, enhancer, topword_description_method, n_topwords_description) + return topics + +@staticmethod +def extract_topic_cos_sim( + documents_topic: list[str], + document_embeddings_topic: np.ndarray, + words_topic: list[str], + vocab_embeddings: dict, + umap_mapper: umap.UMAP, + n_topwords: int = 2000 +) -> Topic: + """ + Create a Topic object from the given documents and embeddings by computing the centroid and the top-words. + Only uses cosine-similarity for top-word extraction. + + Args: + documents_topic (list[str]): List of documents in the topic. + document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic. + words_topic (list[str]): List of words in the topic. + vocab_embeddings (dict): Embeddings of the vocabulary. + umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space. + n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000). + + Returns: + Topic: Topic object representing the extracted topic. 
+ """ + + topword_extraction_methods = ["cosine_similarity"] + extractor = ExtractTopWords() + centroid_hd = extractor.extract_centroid(document_embeddings_topic) + centroid_ld = umap_mapper.transform(centroid_hd.reshape(1, -1))[0] + + labels = np.zeros(len(documents_topic), dtype = int) #everything has label 0 + + word_topic_mat = extractor.compute_word_topic_mat(documents_topic, words_topic, labels, consider_outliers = False) # compute the word-topic matrix of the corpus + if "cosine_similarity" in topword_extraction_methods: + cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = words_topic, vocab_embedding_dict = vocab_embeddings, centroid_dict= {0: centroid_ld}, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False) + + + + top_words = { + "cosine_similarity": cosine_topwords if "cosine_similarity" in topword_extraction_methods else None + } + top_word_scores = { + "cosine_similarity": cosine_dict if "cosine_similarity" in topword_extraction_methods else None + } + + document_embeddings_hd = document_embeddings_topic + document_embeddings_ld = umap_mapper.transform(document_embeddings_hd) + document_embedding_similarity = np.dot(document_embeddings_ld, centroid_ld)/(np.linalg.norm(document_embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld)) # is this correct??? 
# NOTE: this is a module-level function, so it is deliberately NOT decorated with
# @staticmethod; a bare staticmethod object is only callable from Python 3.10 on.
def extract_and_describe_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    enhancer: TopwordEnhancement,
    n_topwords: int = 2000,
    n_topwords_description: int = 500
) -> Topic:
    """
    Create a Topic object from the given documents and embeddings, then describe and name it.

    The topic is built with ``extract_topic_cos_sim`` (cosine-similarity top-words only) and
    afterwards passed through ``describe_and_name_topics`` with the "cosine_similarity" top-words.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        n_topwords (int, optional): Number of top-words to extract from the topic (default is 2000).
        n_topwords_description (int, optional): Number of top-words used for the description and the name (default is 500).

    Returns:
        Topic: Topic object representing the extracted and described topic.
    """
    topic = extract_topic_cos_sim(documents_topic, document_embeddings_topic, words_topic, vocab_embeddings, umap_mapper, n_topwords)
    # describe_and_name_topics works on lists, so wrap the single topic and unwrap the result.
    # (The original repeated these statements after the return; that copy was unreachable.)
    return describe_and_name_topics([topic], enhancer, "cosine_similarity", n_topwords_description)[0]
basic_instruction = "You are a helpful assistant. You are excellent at inferring topics from top-words extracted via topic-modelling. You make sure that everything you output is strictly based on the provided text."


class TopwordEnhancement:
    """Name and describe topics by sending their top-words or documents to a chat-completion client."""

    def __init__(
        self,
        client,
        openai_model: str = "gpt-3.5-turbo",
        max_context_length: int = 4000,
        openai_model_temperature: float = 0.5,
        basic_model_instruction: str = basic_instruction,
        corpus_instruction: str = "") -> None:
        """
        Initialize the enhancer with the specified parameters.

        Args:
            client: Client object; ``client.chat.completions.create(...)`` is called for every query.
            openai_model (str, optional): The OpenAI model to use (default is "gpt-3.5-turbo").
            max_context_length (int, optional): Token budget used when truncating the input (default is 4000).
            openai_model_temperature (float, optional): The temperature passed to the model (default is 0.5).
            basic_model_instruction (str, optional): The basic (system) instruction for the model.
            corpus_instruction (str, optional): Extra information about the corpus, appended to the system message.

        Returns:
            None
        """

        # do some checks on the input arguments
        assert openai_model is not None, "Please provide an openai model"
        assert max_context_length > 0, "Please provide a positive max_context_length"
        assert openai_model_temperature > 0, "Please provide a positive openai_model_temperature"

        self.client = client
        self.openai_model = openai_model
        self.max_context_length = max_context_length
        self.openai_model_temperature = openai_model_temperature
        self.basic_model_instruction = basic_model_instruction
        self.corpus_instruction = f" The following information is available about the corpus used to identify the topics: {corpus_instruction}"

    def __str__(self) -> str:
        return f"TopwordEnhancement(openai_model = {self.openai_model})"

    def __repr__(self) -> str:
        return f"TopwordEnhancement(openai_model = {self.openai_model})"

    def _token_counter(self) -> Callable:
        """Return a function mapping a string to its token count for ``self.openai_model``."""
        # Build the encoding once per query instead of once per word/document.
        encoding = tiktoken.encoding_for_model(self.openai_model)
        return lambda text: len(encoding.encode(text))

    def _system_message(self) -> str:
        """Return the system message sent with every query."""
        return self.basic_model_instruction + " " + self.corpus_instruction

    def _truncate_to_context(self, items, item_kind: str):
        """Keep only the leading ``items`` whose cumulative token count fits ``max_context_length``."""
        if len(items) == 0:  # nothing to truncate; also avoids indexing an empty cumsum
            return items

        count = self._token_counter()
        tokens_cumsum = np.cumsum([count(item + ", ") for item in items]) + count(self._system_message())
        if tokens_cumsum[-1] > self.max_context_length:
            # index of the first item that pushes the total over the budget
            n_fit = np.argmax(tokens_cumsum > self.max_context_length)
            print(f"Too many {item_kind} given. Using only the first part of the {item_kind} that fits into the context length. Number of {item_kind} used: ", n_fit)
            items = items[:n_fit]
        return items

    def _chat(self, user_content: str):
        """Send the system message plus ``user_content`` to the client and return the completion object."""
        return self.client.chat.completions.create(model=self.openai_model,
                                                   messages=[
                                                       {"role": "system", "content": self._system_message()},
                                                       {"role": "user", "content": user_content},
                                                   ],
                                                   temperature = self.openai_model_temperature)

    def count_tokens_api_message(self, messages: list[dict[str, str]]) -> int:
        """
        Count the number of tokens in the API messages.

        Args:
            messages (list[dict[str, str]]): List of messages; only the values under the key "content" are counted.

        Returns:
            int: Number of tokens in the messages.
        """
        count = self._token_counter()
        return sum(count(value) for message in messages for key, value in message.items() if key == "content")

    def describe_topic_topwords_completion_object(self,
                                                  topwords: list[str],
                                                  n_words: int = None,
                                                  query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic."):
        """
        Describe the given topic based on its topwords using the OpenAI model.

        Args:
            topwords (list[str]): List of topwords. A dict is also accepted, in which case its entry ``0`` is used.
            n_words (int, optional): Number of words to use for the query. If None, all words are used.
            query_function (Callable, optional): Function to query the model. It receives the (numpy array of) topwords and returns a string.

        Returns:
            The completion object returned by the client.
        """

        # Unwrap the dict form BEFORE deriving n_words; otherwise n_words would be
        # len(dict) and the topwords would be cut down to that many words.
        if isinstance(topwords, dict):
            topwords = topwords[0]
        if n_words is None:
            n_words = len(topwords)

        topwords = np.array(topwords[:n_words])

        # if too many topwords are given, use only the first part that fits into the context length
        topwords = self._truncate_to_context(topwords, "topwords")

        return self._chat(query_function(topwords))

    def describe_topic_topwords_str(self,
                                    topwords: list[str],
                                    n_words: int = None,
                                    query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic. Make sure the descriptions are short and concise! Do not cite more than 5 words per sub-aspect!!!") -> str:
        """
        Describe the given topic based on its topwords using the OpenAI model.

        Args:
            topwords (list[str]): List of topwords.
            n_words (int, optional): Number of words to use for the query. If None, all words are used.
            query_function (Callable, optional): Function to query the model. It receives the topwords and returns a string.

        Returns:
            str: The content of the first choice of the completion.
        """

        completion = self.describe_topic_topwords_completion_object(topwords, n_words, query_function)
        return completion.choices[0].message.content

    def generate_topic_name_str(self,
                                topwords: list[str],
                                n_words: int = None,
                                query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Give me only the title of the topic and nothing else please. Make sure the title is precise and not longer than 5 words, ideally even shorter.") -> str:
        """
        Generate a topic name based on the given topwords using the OpenAI model.

        Args:
            topwords (list[str]): List of topwords.
            n_words (int, optional): Number of words to use for the query. If None, all words are used.
            query_function (Callable, optional): Function to query the model. It receives the topwords and returns a string.

        Returns:
            str: A topic name generated by the model in the form of a string.
        """

        return self.describe_topic_topwords_str(topwords, n_words, query_function)

    def describe_topic_documents_completion_object(self,
                                                   documents: list[str],
                                                   truncate_doc_thresh=100,
                                                   n_documents: int = None,
                                                   query_function: Callable = lambda docs: f"Please give me the common topic of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic."):
        """
        Describe the given topic based on its documents using the OpenAI model.

        Args:
            documents (list[str]): List of documents.
            truncate_doc_thresh (int, optional): Maximum number of space-separated words kept per document.
            n_documents (int, optional): Number of documents to use for the query. If None, all documents are used.
            query_function (Callable, optional): Function to query the model. It receives the list of documents and returns a string.

        Returns:
            The completion object returned by the client.
        """

        if n_documents is None:
            n_documents = len(documents)
        documents = documents[:n_documents]

        # prune every document to its first truncate_doc_thresh space-separated words
        documents = [" ".join(doc.split(" ")[:truncate_doc_thresh]) for doc in documents]

        # if too many documents are given, use only the first part that fits into the context length
        documents = self._truncate_to_context(documents, "documents")

        return self._chat(query_function(documents))

    @staticmethod
    def sample_identity(n_docs: int) -> np.ndarray:
        """
        Generate an identity array of document indices without changing their order.

        Args:
            n_docs (int): Number of documents.

        Returns:
            np.ndarray: An array containing document indices from 0 to (n_docs - 1).
        """

        return np.arange(n_docs)

    @staticmethod
    def sample_uniform(n_docs: int) -> np.ndarray:
        """
        Randomly sample document indices without replacement.

        Args:
            n_docs (int): Number of documents.

        Returns:
            np.ndarray: An array containing randomly permuted document indices from 0 to (n_docs - 1).
        """

        return np.random.permutation(n_docs)

    @staticmethod
    def sample_poisson(n_docs: int) -> np.ndarray:
        """
        Draw document indices from a Poisson(1) distribution, favoring documents from the beginning of the list.

        The indices are drawn independently, so they may repeat. Draws that would fall
        outside ``0 .. n_docs - 1`` are clipped to the last valid index.

        Args:
            n_docs (int): Number of documents.

        Returns:
            np.ndarray: ``n_docs`` indices, each in ``0 .. n_docs - 1``.
        """

        return np.minimum(np.random.poisson(1, n_docs), n_docs - 1)

    def describe_topic_documents_sampling_completion_object(
            self,
            documents: list[str],
            truncate_doc_thresh=100,
            n_documents: int = None,
            query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
            sampling_strategy: "Callable | str | None" = None,):
        """
        Describe a topic based on a sample of its documents by using the openai model.

        Args:
            documents (list[str]): List of documents ordered by similarity to the topic's centroid.
            truncate_doc_thresh (int, optional): Maximum number of words kept per document. Defaults to 100.
            n_documents (int, optional): Number of documents to sample. If None, or larger than the number of
                documents, all documents are used. Defaults to None.
            query_function (Callable, optional): Function to query the model. It receives the sampled documents and returns a string.
            sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first
                documents are used. A string selects a built-in sampler: "topk"/"identity", "uniform" or "poisson".
                A callable receives the number of documents and returns the indices to use.

        Returns:
            The completion object returned by the client.

        Raises:
            ValueError: If ``sampling_strategy`` is a string that names no built-in sampler.
        """

        if n_documents is None:
            n_documents = len(documents)
        # never ask a sampler for more indices than there are documents
        n_documents = min(n_documents, len(documents))

        named_samplers = {
            "topk": self.sample_identity,
            "identity": self.sample_identity,
            "uniform": self.sample_uniform,
            "poisson": self.sample_poisson,
        }
        if sampling_strategy is None:
            sampler = self.sample_identity  # documented default: use the first documents
        elif isinstance(sampling_strategy, str):
            if sampling_strategy not in named_samplers:
                raise ValueError(f"Unknown sampling_strategy '{sampling_strategy}'. Use one of {sorted(named_samplers)} or a callable.")
            sampler = named_samplers[sampling_strategy]
        else:
            sampler = sampling_strategy

        sampled_documents = [documents[index] for index in sampler(n_documents)]

        return self.describe_topic_documents_completion_object(sampled_documents, truncate_doc_thresh, n_documents, query_function)

    def describe_topic_document_sampling_str(
            self,
            documents: list[str],
            truncate_doc_thresh=100,
            n_documents: int = None,
            query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
            sampling_strategy: "Callable | str | None" = None,) -> str:
        """
        Describe a topic based on a sample of its documents by using the openai model.

        Args:
            documents (list[str]): List of documents ordered by similarity to the topic's centroid.
            truncate_doc_thresh (int, optional): Maximum number of words kept per document. Defaults to 100.
            n_documents (int, optional): Number of documents to sample. If None, all documents are used. Defaults to None.
            query_function (Callable, optional): Function to query the model. It receives the sampled documents and returns a string.
            sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents; see
                ``describe_topic_documents_sampling_completion_object``. Defaults to None.

        Returns:
            str: The content of the first choice of the completion.
        """

        # The original called a misspelled method name here and therefore always raised AttributeError.
        completion = self.describe_topic_documents_sampling_completion_object(documents, truncate_doc_thresh, n_documents, query_function, sampling_strategy)
        return completion.choices[0].message.content
'OpenAI(organization=os.environ.get('OPENAI_ORG'))' + # openai.organization = os.environ.get('OPENAI_ORG') + + with open("../../Data/Emebeddings/embeddings_20ng_raw.pkl", "rb") as f: + data_raw = pickle.load(f) + + corpus = data_raw["corpus"] + doc_embeddings = data_raw["embeddings"] + + n_docs = int(len(corpus) * sample_size) + cls.corpus = corpus[:n_docs] + cls.doc_embeddings = doc_embeddings[:n_docs] + + print("Using {} out of {} documents".format(n_docs, len(data_raw["corpus"]))) + + with open("../../Data/Emebeddings/embeddings_20ng_vocab.pkl", "rb") as f: + cls.embeddings_vocab = pickle.load(f) + + def test_init(self): + """ + test the init function of the TopicGPT class + """ + print("Testing init...") + topicgpt = TopicGPT(api_key = self.api_key_openai) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + corpus_instruction="This is a corpus instruction", + document_embeddings = self.doc_embeddings, + vocab_embeddings= self.embeddings_vocab) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + # check if assertions are triggered + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = None, + n_topics= 32, + openai_prompting_model="gpt-4", + max_number_of_tokens=8000, + corpus_instruction="This is a corpus instruction") + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 0, + max_number_of_tokens=8000, + corpus_instruction="This is a corpus instruction") + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + max_number_of_tokens=0, + corpus_instruction="This is a corpus instruction") + + def test_fit(self): + """ + test the fit function of the TopicGPT class + """ + print("Testing fit...") + + def instance_test(topicgpt): + 
topicgpt.fit(self.corpus) + + self.assertTrue(hasattr(topicgpt, "vocab")) + self.assertTrue(hasattr(topicgpt, "topic_lis")) + + self.assertTrue(isinstance(topicgpt.vocab, list)) + self.assertTrue(isinstance(topicgpt.vocab[0], str)) + + self.assertTrue(isinstance(topicgpt.topic_lis, list)) + self.assertTrue(type(topicgpt.topic_lis[0]) == Topic) + + if topicgpt.n_topics is not None: + self.assertTrue(len(topicgpt.topic_lis) == topicgpt.n_topics) + + self.assertTrue(topicgpt.topic_lis == topicgpt.topic_prompting.topic_lis) + self.assertTrue(topicgpt.vocab == topicgpt.topic_prompting.vocab) + self.assertTrue(topicgpt.vocab_embeddings == topicgpt.topic_prompting.vocab_embeddings) + + + topicgpt1 = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab) + + topicgpt2 = TopicGPT(api_key = self.api_key_openai, + n_topics= None, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab) + + topicgpt3 = TopicGPT(api_key=self.api_key_openai, + n_topics = 1, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab, + n_topwords=10, + n_topwords_description=10, + topword_extraction_methods=["cosine_similarity"]) + + clusterer4 = Clustering_and_DimRed( + n_dims_umap = 10, + n_neighbors_umap = 20, + min_cluster_size_hdbscan = 10, + number_clusters_hdbscan= 10 # use only 10 clusters + ) + + topword_enhancement4 = TopwordEnhancement(api_key = self.api_key_openai) + topic_prompting4 = TopicPrompting( + api_key = self.api_key_openai, + enhancer = topword_enhancement4, + topic_lis = None + ) + + topicgpt4 = TopicGPT(api_key=self.api_key_openai, + n_topics= None, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab, + topic_prompting = topic_prompting4, + clusterer = clusterer4, + topword_extraction_methods=["tfidf"]) + + + topic_gpt_list = [topicgpt1, topicgpt2, topicgpt3, topicgpt4] + + for topic_gpt 
in topic_gpt_list: + instance_test(topic_gpt) + + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/test/TestPackage/TestTopicGPT_prompting.py b/LLMTopicDetection_TopicGPT/test/TestPackage/TestTopicGPT_prompting.py new file mode 100644 index 0000000..c30ffd6 --- /dev/null +++ b/LLMTopicDetection_TopicGPT/test/TestPackage/TestTopicGPT_prompting.py @@ -0,0 +1,469 @@ +""" +This class is used to mainly test the prompting functionality of the TopicGPT package. +""" + +import os +import sys +import inspect + +import openai +import pickle +import unittest + +from topicgpt.TopicGPT import TopicGPT +from topicgpt.TopicRepresentation import Topic +from topicgpt.Clustering import Clustering_and_DimRed +from topicgpt.TopwordEnhancement import TopwordEnhancement +from topicgpt.TopicPrompting import TopicPrompting + + +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.environ.get('OPENAI_ORG'))' +# openai.organization = os.environ.get('OPENAI_ORG') + +class TestTopicGPT_prompting(unittest.TestCase): + """ + This class is used to mainly test the prompting functionality of the TopicGPT class. 
+ """ + + @classmethod + def setUp(self): + """ + load the necessary topic prompting object + """ + + print("Setting up class...") + try: + with open("Data/SavedTopicRepresentations/TopicGpt_20ng.pkl", "rb") as f: + self.topicgpt = pickle.load(f) + except FileNotFoundError: + with open("../../Data/SavedTopicRepresentations/TopicGpt_20ng.pkl", "rb") as f: + self.topicgpt = pickle.load(f) + + print(f"The topic list of this object is: \n {self.topicgpt.topic_lis} \n\n") + + def test_visualize_clusters(self): + """ + test the visualize_clusters function of the TopicGPT class + """ + print("Testing visualize_clusters...") + self.topicgpt.visualize_clusters() + + def test_repr_topics(self): + """ + test the repr_topics function of the TopicGPT class + """ + print("Testing repr_topics...") + self.assertTrue(type(self.topicgpt.repr_topics()) == str) + + def test_promt_knn_search(self): + """ + test the ppromt function that calls knn_search of the TopicPrompting class + """ + print("Testing ppromt_knn_search...") + + prompt_lis = ["Is topic 0 about Bananas? Use knn Search", + "Is topic 0 about Space? Use knn Search", + "Is topic 13 about Space exploration? 
Use knn Search"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result[0]) == list) + self.assertTrue(type(function_result[1]) == list) + self.assertTrue(type(function_result[0][0]) == str) + self.assertTrue(type(function_result[1][0]) == int) + + def test_promt_identify_topic_idx(self): + """ + test the prompt function that calls identify_topic_idx of the TopicPrompting class + """ + + print("Testing ppromt_identify_topic_idx...") + prompt_lis = ["What is the index of the topic about Space?", + "What is the index of the topic about cars?", + "What is the index of the topic about gun control?"] + correct_indices = [13, 9, 2] + + for prompt, correct_idx in zip(prompt_lis, correct_indices): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == int) + self.assertTrue(function_result == correct_idx) # expected index per prompt is given in correct_indices (e.g. 13 for the Space topic) + + def test_prompt_identify_topc_idx_no_index_prompt(self): + """ + test the prompt function that calls identify_topic_idx of the TopicPrompting class with a prompt for which no topic index is expected + """ + + print("Testing ppromt_identify_topic_idx...") + no_index_prompt = "What is the index of the topic about bananas?" 
+ + answer, function_result = self.topicgpt.prompt(no_index_prompt) + + print(f"Answer to the prompt '{no_index_prompt}' \n is \n '{answer}'") + self.assertTrue(type(answer) == str) + self.assertTrue(function_result == None) + + def test_prompt_split_topic_kmeans(self): + """ + test the prompt function that calls split_topic_kmeans of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_kmeans...") + + prompt_lis = ["Split topic 0 into 2 subtopics using kmeans", + "Split topic 1 into 3 subtopics using kmeans", + "Split topic 2 into 4 subtopics using kmeans"] + added_topic_lis_len = [2, 3, 4] + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_kmeans_inplace(self): + """ + test the prompt function that calls split_topic_kmeans of the TopicPrompting class when the split is requested inplace + """ + + print("Testing ppromt_split_topic_kmeans...") + + prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. 
Do this inplace"] + added_topic_lis_len = [2] + + old_number_of_topics = len(self.topicgpt.topic_lis) + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_hdbscan(self): + """ + test the ppromt function that calls split_topic_hdbscan of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_hdbscan...") + + prompt_lis = ["Split topic 0 into subtopics using hdbscan", + "Split topic 1 into subtopics using hdbscan", + "Split topic 2 into subtopics using hdbscan"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + def test_prompt_split_topic_hdbscan_inplace(self): + """ + test the ppromt function that calls split_topic_hdbscan of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_hdbscan...") + + prompt_lis = ["Split topic 4 into subtopics using hdbscan. 
Do this inplace"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + print("topic_gpt_topic_list: ", self.topicgpt.topic_lis) + print("function_result: ", function_result) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_keywords(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 0 into 2 subtopics based on the keywords Technology and Computers", + "Split topic 14 into two subbtopics based on the keywords Space and Exploration"] + + added_topic_lis_len = [2, 2] + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + print(type(function_result[0])) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_keywords_inplace(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 13 into 2 subtopics based on the keywords 'Rocket and 'Milky Way'. 
Do this inplace"] + + added_topic_lis_len = [2] + old_number_of_topics = len(self.topicgpt.topic_lis) + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len - 1) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_single_keyword(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic into two topics using the additional keyword 'Technology'", + "Split topic into two topics using the additional keyword 'Space'"] + + added_topic_lis_len = [2, 2] + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_single_keyword_inplace(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 0 into 2 subtopics based on the keywords Technology and Computers. 
Do this inplace"] + + added_topic_lis_len = [2] + old_number_of_topics = len(self.topicgpt.topic_lis) + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_combine_topics(self): + """ + test the prompt function that calls combine_topics of the TopicPrompting class + """ + + print("Testing ppromt_combine_topics...") + + prompt_lis = ["Combine topic 0 and topic 1 into one topic", + "Combine topic 1 and topic 2 into one topic", + "Combine topic 2 and topic 3 into one topic"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) -1) + + def test_prompt_combine_topics_inplace(self): + """ + test the prompt function that calls combine_topics of the TopicPrompting class + """ + + print("Testing ppromt_combine_topics...") + + prompt_lis = ["Combine topic 0 and topic 1 into one topic. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + print("topic_gpt_topic_list: ", self.topicgpt.topic_lis) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics -1) + + def test_prompt_add_new_topic_keyword(self): + """ + test the prompt function that calls add_new_topic_keyword of the TopicPrompting class + """ + + print("Testing ppromt_add_new_topic_keyword...") + + prompt_lis = ["Add a new topic with the keyword 'Politics'", + "Add a new topic with the keyword 'Climate Change'", + "Add a new topic with the keyword 'Computers'"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + print(type(function_result[0])) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) +1) + + def test_prompt_add_new_topic_keyword_inplace(self): + """ + test the prompt function that calls add_new_topic_keyword of the TopicPrompting class + """ + + print("Testing ppromt_add_new_topic_keyword...") + + prompt_lis = ["Add a new topic with the keyword 'Politics'. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics +1) + + def test_prompt_delete_topic(self): + """ + test the prompt function that calls delete_topic of the TopicPrompting class + """ + + print("Testing ppromt_delete_topic...") + + prompt_lis = ["Delete topic 0", + "Delete topic 1", + "Delete topic 2"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) -1) + + def test_prompt_delete_topic_inplace(self): + """ + test the prompt function that calls delete_topic of the TopicPrompting class + """ + + print("Testing ppromt_delete_topic...") + + prompt_lis = ["Delete topic 0. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics -1) + + def test_prompt_get_topic_information(self): + """ + test the get_topic_information function of the TopicGPT class + """ + + print("Testing get_topic_information...") + + prompt_lis = ["Please compare topic 0 and topic 1", + "Please compare topic 3,4,5"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == dict) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/test/TestTopicGPT_init_and_fit.py b/LLMTopicDetection_TopicGPT/test/TestTopicGPT_init_and_fit.py new file mode 100644 index 0000000..ec4df3f --- /dev/null +++ b/LLMTopicDetection_TopicGPT/test/TestTopicGPT_init_and_fit.py @@ -0,0 +1,178 @@ +""" +This class is used to test the init and fit functions of the TopicGPT class +""" + +import os +import sys +import inspect +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) + +sys.path.insert(0, f"{parentdir}/src") +from topicgpt.TopicGPT import TopicGPT + +sys.path.insert(0, parentdir) + +import openai +import pickle + +import unittest + +from src.topicgpt.TopicRepresentation import Topic + +from src.topicgpt.Clustering import Clustering_and_DimRed +from 
src.topicgpt.TopwordEnhancement import TopwordEnhancement +from src.topicgpt.TopicPrompting import TopicPrompting + +class TestTopicGPT_init_and_fit(unittest.TestCase): + """ + Test the init and fit functions of the TopicGPT class + """ + + @classmethod + def setUpClass(cls, sample_size = 0.1): + """ + load the necessary data and only keep a sample of it + """ + print("Setting up class...") + cls.api_key_openai = os.environ.get('api_key') + # TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.environ.get('OPENAI_ORG'))' + # openai.organization = os.environ.get('OPENAI_ORG') + + with open("Data/Emebeddings/embeddings_20ng_raw.pkl", "rb") as f: + data_raw = pickle.load(f) + + corpus = data_raw["corpus"] + doc_embeddings = data_raw["embeddings"] + + n_docs = int(len(corpus) * sample_size) + cls.corpus = corpus[:n_docs] + cls.doc_embeddings = doc_embeddings[:n_docs] + + print("Using {} out of {} documents".format(n_docs, len(data_raw["corpus"]))) + + with open("Data/Emebeddings/embeddings_20ng_vocab.pkl", "rb") as f: + cls.embeddings_vocab = pickle.load(f) + + def test_init(self): + """ + test the init function of the TopicGPT class + """ + print("Testing init...") + topicgpt = TopicGPT(api_key = self.api_key_openai) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + corpus_instruction="This is a corpus instruction", + document_embeddings = self.doc_embeddings, + vocab_embeddings= self.embeddings_vocab) + self.assertTrue(isinstance(topicgpt, TopicGPT)) + + # check if assertions are triggered + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = None, + n_topics= 32, + openai_prompting_model="gpt-4", + max_number_of_tokens=8000, + corpus_instruction="This 
is a corpus instruction") + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 0, + max_number_of_tokens=8000, + corpus_instruction="This is a corpus instruction") + + with self.assertRaises(AssertionError): + topicgpt = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + max_number_of_tokens=0, + corpus_instruction="This is a corpus instruction") + + def test_fit(self): + """ + test the fit function of the TopicGPT class + """ + print("Testing fit...") + + def instance_test(topicgpt): + topicgpt.fit(self.corpus) + + self.assertTrue(hasattr(topicgpt, "vocab")) + self.assertTrue(hasattr(topicgpt, "topic_lis")) + + self.assertTrue(isinstance(topicgpt.vocab, list)) + self.assertTrue(isinstance(topicgpt.vocab[0], str)) + + self.assertTrue(isinstance(topicgpt.topic_lis, list)) + try: + self.assertTrue(type(topicgpt.topic_lis[0]) == Topic) + except AssertionError as e: + print(e) + print(type(topicgpt.topic_lis[0])) + print(topicgpt.topic_lis[0]) + + if topicgpt.n_topics is not None: + self.assertTrue(len(topicgpt.topic_lis) == topicgpt.n_topics) + + self.assertTrue(topicgpt.topic_lis == topicgpt.topic_prompting.topic_lis) + self.assertTrue(topicgpt.vocab == topicgpt.topic_prompting.vocab) + self.assertTrue(topicgpt.vocab_embeddings == topicgpt.topic_prompting.vocab_embeddings) + + + topicgpt1 = TopicGPT(api_key = self.api_key_openai, + n_topics= 20, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab) + + topicgpt2 = TopicGPT(api_key = self.api_key_openai, + n_topics= None, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab) + + topicgpt3 = TopicGPT(api_key=self.api_key_openai, + n_topics = 1, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab, + n_topwords=10, + n_topwords_description=10, + topword_extraction_methods=["cosine_similarity"]) + + clusterer4 = Clustering_and_DimRed( + n_dims_umap = 10, + 
n_neighbors_umap = 20, + min_cluster_size_hdbscan = 10, + number_clusters_hdbscan= 10 # use only 10 clusters + ) + + topword_enhancement4 = TopwordEnhancement(api_key = self.api_key_openai) + topic_prompting4 = TopicPrompting( + api_key = self.api_key_openai, + enhancer = topword_enhancement4, + topic_lis = None + ) + + topicgpt4 = TopicGPT(api_key=self.api_key_openai, + n_topics= None, + document_embeddings = self.doc_embeddings, + vocab_embeddings = self.embeddings_vocab, + topic_prompting = topic_prompting4, + clusterer = clusterer4, + topword_extraction_methods=["tfidf"]) + + + topic_gpt_list = [topicgpt1, topicgpt2, topicgpt3, topicgpt4] + + for topic_gpt in topic_gpt_list: + instance_test(topic_gpt) + + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/LLMTopicDetection_TopicGPT/test/TestTopicGPT_prompting.py b/LLMTopicDetection_TopicGPT/test/TestTopicGPT_prompting.py new file mode 100644 index 0000000..ea72dfd --- /dev/null +++ b/LLMTopicDetection_TopicGPT/test/TestTopicGPT_prompting.py @@ -0,0 +1,476 @@ +""" +This module is used to mainly test the prompting functionality of the TopicGPT class +""" + + +import os +import sys +import inspect +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) + +sys.path.insert(0, f"{parentdir}/src") +from topicgpt.TopicGPT import TopicGPT + +sys.path.insert(0, parentdir) + +import openai +import pickle + +import unittest + +from src.topicgpt.TopicRepresentation import Topic + +from src.topicgpt.Clustering import Clustering_and_DimRed +from src.topicgpt.TopwordEnhancement import TopwordEnhancement +from src.topicgpt.TopicPrompting import TopicPrompting + + +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 
'OpenAI(organization=os.environ.get('OPENAI_ORG'))' +# openai.organization = os.environ.get('OPENAI_ORG') + +class TestTopicGPT_prompting(unittest.TestCase): + """ + This class is used to mainly test the prompting functionality of the TopicGPT class. + """ + + @classmethod + def setUp(self): + """ + load the necessary topic prompting object + """ + + print("Setting up class...") + try: + with open("Data/SavedTopicRepresentations/TopicGpt_20ng.pkl", "rb") as f: + self.topicgpt = pickle.load(f) + except FileNotFoundError: + with open("../Data/SavedTopicRepresentations/TopicGpt_20ng.pkl", "rb") as f: + self.topicgpt = pickle.load(f) + + print(f"The topic list of this object is: \n {self.topicgpt.topic_lis} \n\n") + + def test_visualize_clusters(self): + """ + test the visualize_clusters function of the TopicGPT class + """ + print("Testing visualize_clusters...") + self.topicgpt.visualize_clusters() + + def test_repr_topics(self): + """ + test the repr_topics function of the TopicGPT class + """ + print("Testing repr_topics...") + self.assertTrue(type(self.topicgpt.repr_topics()) == str) + + def test_promt_knn_search(self): + """ + test the ppromt function that calls knn_search of the TopicPrompting class + """ + print("Testing ppromt_knn_search...") + + prompt_lis = ["Is topic 0 about Bananas? Use knn Search", + "Is topic 0 about Space? Use knn Search", + "Is topic 13 about Space exploration? 
Use knn Search"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result[0]) == list) + self.assertTrue(type(function_result[1]) == list) + self.assertTrue(type(function_result[0][0]) == str) + self.assertTrue(type(function_result[1][0]) == int) + + def test_promt_identify_topic_idx(self): + """ + test the prompt function that calls identify_topic_idx of the TopicPrompting class + """ + + print("Testing ppromt_identify_topic_idx...") + prompt_lis = ["What is the index of the topic about Space?", + "What is the index of the topic about cars?", + "What is the index of the topic about gun control?"] + correct_indices = [13, 9, 2] + + for prompt, correct_idx in zip(prompt_lis, correct_indices): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == int) + self.assertTrue(function_result == correct_idx) # expected index per prompt is given in correct_indices (e.g. 13 for the Space topic) + + def test_prompt_identify_topc_idx_no_index_prompt(self): + """ + test the prompt function that calls identify_topic_idx of the TopicPrompting class with a prompt for which no topic index is expected + """ + + print("Testing ppromt_identify_topic_idx...") + no_index_prompt = "What is the index of the topic about bananas?" 
+ + answer, function_result = self.topicgpt.prompt(no_index_prompt) + + print(f"Answer to the prompt '{no_index_prompt}' \n is \n '{answer}'") + self.assertTrue(type(answer) == str) + self.assertTrue(function_result == None) + + def test_prompt_split_topic_kmeans(self): + """ + test the ppromt function that calls split_topic_kmeans of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_kmeans...") + + prompt_lis = ["Split topic 0 into 2 subtopics using kmeans", + "Split topic 1 into 3 subtopics using kmeans", + "Split topic 2 into 4 subtopics using kmeans"] + added_topic_lis_len = [2, 3, 4] + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_kmeans_inplace(self): + """ + test the ppromt function that calls split_topic_kmeans of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_kmeans...") + + prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. 
Do this inplace"] + added_topic_lis_len = [2] + + old_number_of_topics = len(self.topicgpt.topic_lis) + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_hdbscan(self): + """ + test the ppromt function that calls split_topic_hdbscan of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_hdbscan...") + + prompt_lis = ["Split topic 0 into subtopics using hdbscan", + "Split topic 1 into subtopics using hdbscan", + "Split topic 2 into subtopics using hdbscan"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + def test_prompt_split_topic_hdbscan_inplace(self): + """ + test the ppromt function that calls split_topic_hdbscan of the TopicPrompting class + """ + + print("Testing ppromt_split_topic_hdbscan...") + + prompt_lis = ["Split topic 4 into subtopics using hdbscan. 
Do this inplace"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_keywords(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 0 into 2 subtopics based on the keywords Technology and Computers", + "Split topic 14 into two subbtopics based on the keywords Space and Exploration"] + + added_topic_lis_len = [2, 2] + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + print(type(function_result[0])) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_keywords_inplace(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 0 into 2 subtopics based on the keywords Technology and Computers. 
Do this inplace"] + + added_topic_lis_len = [2] + old_number_of_topics = len(self.topicgpt.topic_lis) + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_split_topic_single_keyword(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic into two topics using the additional keyword 'Technology'", + "Split topic into two topics using the additional keyword 'Space'"] + + added_topic_lis_len = [2, 2] + + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == added_topic_len + len(self.topicgpt.topic_lis) -1 ) + + def test_prompt_split_topic_single_keyword_inplace(self): + """ + test the prompt function that calls split_topic_keywords of the TopicPrompting class. This test works almost the same as the test_prompt_split_topic_kmeans + """ + + print("Testing ppromt_split_topic_keywords...") + + prompt_lis = ["Split topic 0 into 2 subtopics based on the keywords Technology and Computers. 
Do this inplace"] + + added_topic_lis_len = [2] + old_number_of_topics = len(self.topicgpt.topic_lis) + for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len): + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_of_topics + added_topic_len -1 ) + self.assertTrue(self.topicgpt.topic_lis == function_result) + + def test_prompt_combine_topics(self): + """ + test the prompt function that calls combine_topics of the TopicPrompting class + """ + + print("Testing ppromt_combine_topics...") + + prompt_lis = ["Combine topic 0 and topic 1 into one topic", + "Combine topic 1 and topic 2 into one topic", + "Combine topic 2 and topic 3 into one topic"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) -1) + + def test_prompt_combine_topics_inplace(self): + """ + test the prompt function that calls combine_topics of the TopicPrompting class + """ + + print("Testing ppromt_combine_topics...") + + prompt_lis = ["Combine topic 0 and topic 1 into one topic. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + print("topic_gpt_topic_list: ", self.topicgpt.topic_lis) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics -1) + + def test_prompt_add_new_topic_keyword(self): + """ + test the prompt function that calls add_new_topic_keyword of the TopicPrompting class + """ + + print("Testing ppromt_add_new_topic_keyword...") + + prompt_lis = ["Add a new topic with the keyword 'Politics'", + "Add a new topic with the keyword 'Climate Change'", + "Add a new topic with the keyword 'Computers'"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + print(type(function_result[0])) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) +1) + + def test_prompt_add_new_topic_keyword_inplace(self): + """ + test the prompt function that calls add_new_topic_keyword of the TopicPrompting class + """ + + print("Testing ppromt_add_new_topic_keyword...") + + prompt_lis = ["Add a new topic with the keyword 'Politics'. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics +1) + + def test_prompt_delete_topic(self): + """ + test the prompt function that calls delete_topic of the TopicPrompting class + """ + + print("Testing ppromt_delete_topic...") + + prompt_lis = ["Delete topic 0", + "Delete topic 1", + "Delete topic 2"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(len(function_result) == len(self.topicgpt.topic_lis) -1) + + def test_prompt_delete_topic_inplace(self): + """ + test the prompt function that calls delete_topic of the TopicPrompting class + """ + + print("Testing ppromt_delete_topic...") + + prompt_lis = ["Delete topic 0. 
Do this inplace"] + old_number_topics = len(self.topicgpt.topic_lis) + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == list) + self.assertTrue(type(function_result[0]) == Topic) + self.assertTrue(self.topicgpt.topic_lis == function_result) + self.assertTrue(len(self.topicgpt.topic_lis) == old_number_topics -1) + + def test_prompt_get_topic_information(self): + """ + test the get_topic_information function of the TopicGPT class + """ + + print("Testing get_topic_information...") + + prompt_lis = ["Please compare topic 0 and topic 1", + "Please compare topic 3,4,5"] + + for prompt in prompt_lis: + + answer, function_result = self.topicgpt.prompt(prompt) + + print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'") + print("function_result: ", function_result) + + self.assertTrue(type(answer) == str) + self.assertTrue(type(function_result) == dict) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file