From a21d9cc768598640f38e4bd03d368f8712a9aa77 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 17 Jan 2021 18:11:30 +0900 Subject: [PATCH] Remove wrappers and other cruft (#2972) * git rm docs/src/simserver.rst * rm -r gensim/models/wrappers * get rid of wrappers import * scrub reference to wrappers from docstring * git rm -r docs/src/models/wrappers * git rm gensim/test/test_coherencemodel.py This file relies on wrappers that aren't there anymore. * rm -r gensim/models/wrappers * rm wrapper and sklearn_api docs * git rm gensim/test/test_sklearn_api.py * bump Cython version to 0.29.21 * decrease tox verbosity * get rid of more six remnants * rm -rf gensim/sklearn_api --- .travis.yml | 2 +- azure-pipelines.yml | 2 +- docs/src/apiref.rst | 16 - docs/src/models/wrappers/dtmmodel.rst | 9 - docs/src/models/wrappers/ldamallet.rst | 9 - docs/src/models/wrappers/ldavowpalwabbit.rst | 9 - docs/src/models/wrappers/varembed.rst | 9 - docs/src/models/wrappers/wordrank.rst | 9 - docs/src/models/wrappers/wrappers.rst | 10 - docs/src/simserver.rst | 362 ----- docs/src/sklearn_api/atmodel.rst | 9 - docs/src/sklearn_api/d2vmodel.rst | 9 - docs/src/sklearn_api/hdp.rst | 9 - docs/src/sklearn_api/ldamodel.rst | 9 - docs/src/sklearn_api/ldaseqmodel.rst | 9 - docs/src/sklearn_api/lsimodel.rst | 9 - docs/src/sklearn_api/phrases.rst | 9 - docs/src/sklearn_api/rpmodel.rst | 9 - docs/src/sklearn_api/text2bow.rst | 9 - docs/src/sklearn_api/tfidf.rst | 9 - docs/src/sklearn_api/w2vmodel.rst | 9 - gensim/corpora/_mmreader.pyx | 6 +- gensim/models/__init__.py | 2 - gensim/models/tfidfmodel.py | 5 - gensim/models/word2vec.py | 3 +- gensim/models/wrappers/__init__.py | 9 - gensim/models/wrappers/dtmmodel.py | 613 -------- gensim/models/wrappers/ldamallet.py | 611 -------- gensim/models/wrappers/ldavowpalwabbit.py | 888 ----------- gensim/models/wrappers/varembed.py | 129 -- gensim/models/wrappers/wordrank.py | 322 ---- gensim/sklearn_api/__init__.py | 24 - gensim/sklearn_api/atmodel.py | 223 --- gensim/sklearn_api/d2vmodel.py | 202 --- gensim/sklearn_api/ftmodel.py | 228 --- gensim/sklearn_api/hdp.py | 198 --- gensim/sklearn_api/ldamodel.py | 246 --- gensim/sklearn_api/ldaseqmodel.py | 148 -- gensim/sklearn_api/lsimodel.py | 164 -- gensim/sklearn_api/phrases.py | 200 --- gensim/sklearn_api/rpmodel.py | 98 -- gensim/sklearn_api/text2bow.py | 122 -- gensim/sklearn_api/tfidf.py | 161 -- gensim/sklearn_api/w2vmodel.py | 186 --- gensim/test/test_coherencemodel.py | 364 ----- gensim/test/test_d2vmodel.py | 57 - gensim/test/test_ldamallet_wrapper.py | 215 --- gensim/test/test_ldavowpalwabbit_wrapper.py | 213 --- gensim/test/test_sklearn_api.py | 1397 ------------------ gensim/test/test_varembed_wrapper.py | 73 - gensim/test/test_wordrank_wrapper.py | 79 - setup.py | 2 - 52 files changed, 4 insertions(+), 7720 deletions(-) delete mode 100644 docs/src/models/wrappers/dtmmodel.rst delete mode 100644 docs/src/models/wrappers/ldamallet.rst delete mode 100644 docs/src/models/wrappers/ldavowpalwabbit.rst delete mode 100644 docs/src/models/wrappers/varembed.rst delete mode 100644 docs/src/models/wrappers/wordrank.rst delete mode 100644 docs/src/models/wrappers/wrappers.rst delete mode 100644 docs/src/simserver.rst delete mode 100644 docs/src/sklearn_api/atmodel.rst delete mode 100644 docs/src/sklearn_api/d2vmodel.rst delete mode 100644 docs/src/sklearn_api/hdp.rst delete mode 100644 docs/src/sklearn_api/ldamodel.rst delete mode 100644 docs/src/sklearn_api/ldaseqmodel.rst delete mode 100644 docs/src/sklearn_api/lsimodel.rst 
delete mode 100644 docs/src/sklearn_api/phrases.rst delete mode 100644 docs/src/sklearn_api/rpmodel.rst delete mode 100644 docs/src/sklearn_api/text2bow.rst delete mode 100644 docs/src/sklearn_api/tfidf.rst delete mode 100644 docs/src/sklearn_api/w2vmodel.rst delete mode 100644 gensim/models/wrappers/__init__.py delete mode 100644 gensim/models/wrappers/dtmmodel.py delete mode 100644 gensim/models/wrappers/ldamallet.py delete mode 100644 gensim/models/wrappers/ldavowpalwabbit.py delete mode 100644 gensim/models/wrappers/varembed.py delete mode 100644 gensim/models/wrappers/wordrank.py delete mode 100644 gensim/sklearn_api/__init__.py delete mode 100644 gensim/sklearn_api/atmodel.py delete mode 100644 gensim/sklearn_api/d2vmodel.py delete mode 100644 gensim/sklearn_api/ftmodel.py delete mode 100644 gensim/sklearn_api/hdp.py delete mode 100644 gensim/sklearn_api/ldamodel.py delete mode 100644 gensim/sklearn_api/ldaseqmodel.py delete mode 100644 gensim/sklearn_api/lsimodel.py delete mode 100644 gensim/sklearn_api/phrases.py delete mode 100644 gensim/sklearn_api/rpmodel.py delete mode 100644 gensim/sklearn_api/text2bow.py delete mode 100644 gensim/sklearn_api/tfidf.py delete mode 100644 gensim/sklearn_api/w2vmodel.py delete mode 100644 gensim/test/test_coherencemodel.py delete mode 100644 gensim/test/test_d2vmodel.py delete mode 100644 gensim/test/test_ldamallet_wrapper.py delete mode 100644 gensim/test/test_ldavowpalwabbit_wrapper.py delete mode 100644 gensim/test/test_sklearn_api.py delete mode 100644 gensim/test/test_varembed_wrapper.py delete mode 100644 gensim/test/test_wordrank_wrapper.py diff --git a/.travis.yml b/.travis.yml index 553a8215d8..1fe894a021 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,7 +44,7 @@ before_script: - ulimit -c unlimited -S # enable core dumps -script: tox -vv +script: tox after_failure: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c624e7e3e9..dfe8f22fb4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,5 +25,5 @@ steps: displayName: 'Install tox' - script: | - tox -vv + tox displayName: 'Testing' diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index d6cdeeaf52..095b403aad 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -57,26 +57,10 @@ Modules: models/word2vec_inner models/doc2vec_inner models/fasttext_inner - models/wrappers/ldamallet - models/wrappers/dtmmodel - models/wrappers/ldavowpalwabbit.rst - models/wrappers/wordrank - models/wrappers/varembed similarities/docsim similarities/termsim similarities/annoy similarities/nmslib - sklearn_api/atmodel - sklearn_api/d2vmodel - sklearn_api/hdp - sklearn_api/ldamodel - sklearn_api/ldaseqmodel - sklearn_api/lsimodel - sklearn_api/phrases - sklearn_api/rpmodel - sklearn_api/text2bow - sklearn_api/tfidf - sklearn_api/w2vmodel test/utils topic_coherence/aggregation topic_coherence/direct_confirmation_measure diff --git a/docs/src/models/wrappers/dtmmodel.rst b/docs/src/models/wrappers/dtmmodel.rst deleted file mode 100644 index 00db4c105a..0000000000 --- a/docs/src/models/wrappers/dtmmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.dtmmodel` -- Dynamic Topic Models (DTM) and Dynamic Influence Models (DIM) -================================================================================================ - -.. 
automodule:: gensim.models.wrappers.dtmmodel - :synopsis: Dynamic Topic Models - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/ldamallet.rst b/docs/src/models/wrappers/ldamallet.rst deleted file mode 100644 index 309fbd9cd1..0000000000 --- a/docs/src/models/wrappers/ldamallet.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.ldamallet` -- Latent Dirichlet Allocation via Mallet -========================================================================== - -.. automodule:: gensim.models.wrappers.ldamallet - :synopsis: Latent Dirichlet Allocation via Mallet - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/ldavowpalwabbit.rst b/docs/src/models/wrappers/ldavowpalwabbit.rst deleted file mode 100644 index 4199184153..0000000000 --- a/docs/src/models/wrappers/ldavowpalwabbit.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.ldavowpalwabbit` -- Latent Dirichlet Allocation via Vowpal Wabbit -======================================================================================= - -.. automodule:: gensim.models.wrappers.ldavowpalwabbit - :synopsis: Latent Dirichlet Allocation via Vowpal Wabbit - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/varembed.rst b/docs/src/models/wrappers/varembed.rst deleted file mode 100644 index 411025582d..0000000000 --- a/docs/src/models/wrappers/varembed.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.varembed` -- VarEmbed Word Embeddings -================================================================================================ - -.. automodule:: gensim.models.wrappers.varembed - :synopsis: VarEmbed Word Embeddings - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/wordrank.rst b/docs/src/models/wrappers/wordrank.rst deleted file mode 100644 index 25f791ab88..0000000000 --- a/docs/src/models/wrappers/wordrank.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.wordrank` -- Word Embeddings from WordRank -================================================================================================ - -.. automodule:: gensim.models.wrappers.wordrank - :synopsis: Wordrank Embeddings - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/wrappers.rst b/docs/src/models/wrappers/wrappers.rst deleted file mode 100644 index 9746202d6d..0000000000 --- a/docs/src/models/wrappers/wrappers.rst +++ /dev/null @@ -1,10 +0,0 @@ -:orphan: - -:mod:`models.wrappers` -- Package for transformation models via external programs -================================================================================= - -.. automodule:: gensim.models.wrappers - :synopsis: Package for transformation models via external programs - :members: - :inherited-members: - diff --git a/docs/src/simserver.rst b/docs/src/simserver.rst deleted file mode 100644 index 20bdfd83f8..0000000000 --- a/docs/src/simserver.rst +++ /dev/null @@ -1,362 +0,0 @@ -:orphan: - -.. _simserver: - -Document Similarity Server -============================= - -The 0.7.x series of `gensim `_ was about improving performance and consolidating API. -0.8.x will be about new features --- 0.8.1, first of the series, is a **document similarity service**. - -The source code itself has been moved from gensim to its own, dedicated package, named `simserver`. -Get it from `PyPI `_ or clone it on `Github `_. 
- -What is a document similarity service? ---------------------------------------- - -Conceptually, a service that lets you : - -1. train a semantic model from a corpus of plain texts (no manual annotation and mark-up needed) -2. index arbitrary documents using this semantic model -3. query the index for similar documents (the query can be either an id of a document already in the index, or an arbitrary text) - - .. sourcecode:: pycon - - >>> from simserver import SessionServer - >>> server = SessionServer('/tmp/my_server') # resume server (or create a new one) - >>> - >>> server.train(training_corpus, method='lsi') # create a semantic model - >>> server.index(some_documents) # convert plain text to semantic representation and index it - >>> server.find_similar(query) # convert query to semantic representation and compare against index - >>> - >>> server.index(more_documents) # add to index: incremental indexing works - >>> server.find_similar(query) - >>> - >>> server.delete(ids_to_delete) # incremental deleting also works - >>> server.find_similar(query) - -.. note:: - "Semantic" here refers to semantics of the crude, statistical type -- - `Latent Semantic Analysis `_, - `Latent Dirichlet Allocation `_ etc. - Nothing to do with the semantic web, manual resource tagging or detailed linguistic inference. - - -What is it good for? ---------------------- - -Digital libraries of (mostly) text documents. More generally, it helps you annotate, -organize and navigate documents in a more abstract way, compared to plain keyword search. - -How is it unique? ------------------ - -1. **Memory independent**. Gensim has unique algorithms for statistical analysis that allow - you to create semantic models of arbitrarily large training corpora (larger than RAM) very quickly - and in constant RAM. -2. **Memory independent (again)**. Indexing shards are stored as files to disk/mmapped back as needed, - so you can index very large corpora. So again, constant RAM, this time independent of the number of indexed documents. -3. **Efficient**. Gensim makes heavy use of Python's NumPy and SciPy libraries to make indexing and - querying efficient. -4. **Robust**. Modifications of the index are transactional, so you can commit/rollback an - entire indexing session. Also, during the session, the service is still available - for querying (using its state from when the session started). Power failures leave - service in a consistent state (implicit rollback). -5. **Pure Python**. Well, technically, NumPy and SciPy are mostly wrapped C and Fortran, but - `gensim `_ itself is pure Python. No compiling, installing or root priviledges needed. -6. **Concurrency support**. The underlying service object is thread-safe and can - therefore be used as a daemon server: clients connect to it via RPC and issue train/index/query requests remotely. -7. **Cross-network, cross-platform and cross-language**. While the Python server runs - over TCP using `Pyro `_, - clients in Java/.NET are trivial thanks to `Pyrolite `_. - -The rest of this document serves as a tutorial explaining the features in more detail. - ------ - -Prerequisites ----------------------- - -It is assumed you have `gensim` properly installed. You'll also -need the `sqlitedict `_ package that wraps -Python's sqlite3 module in a thread-safe manner:: - - $ pip install sqlitedict - -To test the remote server capabilities, install Pyro4 (Python Remote Objects, at -version 4.8 as of this writing):: - - $ pip install Pyro4 - -.. 
note:: - Don't forget to initialize logging to see logging messages: - - .. sourcecode:: pycon - - >>> import logging - >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - -What is a document? -------------------- - -In case of text documents, the service expects: - -.. sourcecode:: pycon - - >>> document = {'id': 'some_unique_string', - >>> 'tokens': ['content', 'of', 'the', 'document', '...'], - >>> 'other_fields_are_allowed_but_ignored': None} - -This format was chosen because it coincides with plain JSON and is therefore easy to serialize and send over the wire, in almost any language. -All strings involved must be utf8-encoded. - - -What is a corpus? ------------------ - -A sequence of documents. Anything that supports the `for document in corpus: ...` -iterator protocol. Generators are ok. Plain lists are also ok (but consume more memory). - -.. sourcecode:: pycon - - >>> from gensim import utils - >>> - >>> texts = ["Human machine interface for lab abc computer applications", - >>> "A survey of user opinion of computer system response time", - >>> "The EPS user interface management system", - >>> "System and human system engineering testing of EPS", - >>> "Relation of user perceived response time to error measurement", - >>> "The generation of random binary unordered trees", - >>> "The intersection graph of paths in trees", - >>> "Graph minors IV Widths of trees and well quasi ordering", - >>> "Graph minors A survey"] - >>> - >>> corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)} - >>> for num, text in enumerate(texts)] - -Since corpora are allowed to be arbitrarily large, it is -recommended client splits them into smaller chunks before uploading them to the server: - -.. sourcecode:: pycon - - >>> utils.upload_chunked(server, corpus, chunksize=1000) # send 1k docs at a time - -Wait, upload what, where? -------------------------- - -If you use the similarity service object (instance of :class:`simserver.SessionServer`) in -your code directly---no remote access---that's perfectly fine. Using the service remotely, from a different process/machine, is an -option, not a necessity. - -Document similarity can also act as a long-running service, a daemon process on a separate machine. In that -case, I'll call the service object a *server*. - -But let's start with a local object. Open your `favourite shell `_ and - -.. sourcecode:: pycon - - >>> from simserver import SessionServer - >>> - >>> service = SessionServer('/tmp/my_server/') # or wherever - -That initialized a new service, located in `/tmp/my_server` (you need write access rights to that directory). - -.. note:: - The service is fully defined by the content of its location directory ("`/tmp/my_server/`"). - If you use an existing location, the service object will resume - from the index found there. Also, to "clone" a service, just copy that - directory somewhere else. The copy will be a fully working duplicate of the - original service. - - -Model training ---------------- - -We can start indexing right away: - -.. sourcecode:: pycon - - >>> service.index(corpus) - AttributeError: must initialize model for /tmp/my_server/b before indexing documents - -Oops, we can not. The service indexes documents in a semantic representation, which -is different to the plain text we give it. We must teach the service how to convert -between plain text and semantics first: - -.. sourcecode:: pycon - - >>> service.train(corpus, method='lsi') - -That was easy. 
The `method='lsi'` parameter meant that we trained a model for -`Latent Semantic Indexing `_ -and default dimensionality (400) over a `tf-idf `_ -representation of our little `corpus`, all automatically. More on that later. - -Note that for the semantic model to make sense, it should be trained -on a corpus that is: - -* Reasonably similar to the documents you want to index later. Training on a corpus - of recipes in French when all indexed documents will be about programming in English - will not help. -* Reasonably large (at least thousands of documents), so that the statistical analysis has - a chance to kick in. Don't use my example corpus here of 9 documents in production O_o - -Indexing documents ------------------- - -.. sourcecode:: pycon - - >>> service.index(corpus) # index the same documents that we trained on... - -Indexing can happen over any documents, but I'm too lazy to create another example corpus, so we index the same 9 docs used for training. - -Delete documents with: - -.. sourcecode:: pycon - - >>> service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index - -When you pass documents that have the same id as some already indexed document, -the indexed document is overwritten by the new input (=only the latest counts; -document ids are always unique per service): - -.. sourcecode:: pycon - - >>> service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten) - -The index/delete/overwrite calls can be arbitrarily interspersed with queries. -You don't have to index **all** documents first to start querying, indexing can be incremental. - -Querying ---------- - -There are two types of queries: - -1. by id: - - .. sourcecode:: pycon - - >>> print(service.find_similar('doc_0')) - [('doc_0', 1.0, None), ('doc_2', 0.30426699, None), ('doc_1', 0.25648531, None), ('doc_3', 0.25480536, None)] - >>> - >>> print(service.find_similar('doc_5')) # we deleted doc_5 and doc_8, remember? - ValueError: document 'doc_5' not in index - - In the resulting 3-tuples, `doc_n` is the document id we supplied during indexing, - `0.30426699` is the similarity of `doc_n` to the query, but what's up with that `None`, you ask? - Well, you can associate each document with a "payload", during indexing. - This payload object (anything pickle-able) is later returned during querying. - If you don't specify `doc['payload']` during indexing, queries simply return `None` in the result tuple, as in our example here. - -2. or by document (using `document['tokens']`; id is ignored in this case): - - .. sourcecode:: pycon - - >>> doc = {'tokens': utils.simple_preprocess('Graph and minors and humans and trees.')} - >>> print(service.find_similar(doc, min_score=0.4, max_results=50)) - [('doc_7', 0.93350589, None), ('doc_3', 0.42718196, None)] - -Remote access -------------- - -So far, we did everything in our Python shell, locally. I very much like `Pyro `_, -a pure Python package for Remote Procedure Calls (RPC), so I'll illustrate remote -service access via Pyro. Pyro takes care of all the socket listening/request routing/data marshalling/thread -spawning, so it saves us a lot of trouble. - -To create a similarity server, we just create a :class:`simserver.SessionServer` object and register it -with a Pyro daemon for remote access. There is a small `example script `_ -included with simserver, run it with:: - - $ python -m simserver.run_simserver /tmp/testserver - -You can just `ctrl+c` to terminate the server, but leave it running for now. 
- -Now open your Python shell again, in another terminal window or possibly on another machine, and - -.. sourcecode:: pycon - - >>> import Pyro4 - >>> service = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver')) - -Now `service` is only a proxy object: every call is physically executed wherever -you ran the `run_server.py` script, which can be a totally different computer -(within a network broadcast domain), but you don't even know: - -.. sourcecode:: pycon - - >>> print(service.status()) - >>> service.train(corpus) - >>> service.index(other_corpus) - >>> service.find_similar(query) - -It is worth mentioning that Irmen, the author of Pyro, also released -`Pyrolite `_ recently. That is a package -which allows you to create Pyro proxies also from Java and .NET, in addition to Python. -That way you can call remote methods from there too---the client doesn't have to be in Python. - -Concurrency ------------ - -Ok, now it's getting interesting. Since we can access the service remotely, what -happens if multiple clients create proxies to it at the same time? What if they -want to modify the server index at the same time? - -Answer: the `SessionServer` object is thread-safe, so that when each client spawns a request -thread via Pyro, they don't step on each other's toes. - -This means that: - -1. There can be multiple simultaneous `service.find_similar` queries (or, in - general, multiple simultaneus calls that are "read-only"). -2. When two clients issue modification calls (`index`/`train`/`delete`/`drop_index`/...) - at the same time, an internal lock serializes them -- the later call has to wait. -3. While one client is modifying the index, all other clients' queries still see - the original index. Only once the modifications are committed do they become - "visible". - -What do you mean, visible? --------------------------- - -The service uses transactions internally. This means that each modification is -done over a clone of the service. If the modification session fails for whatever -reason (exception in code; power failure that turns off the server; client unhappy -with how the session went), it can be rolled back. It also means other clients can -continue querying the original index during index updates. - -The mechanism is hidden from users by default through auto-committing (it was already happening -in the examples above too), but auto-committing can be turned off explicitly - -.. sourcecode:: pycon - - >>> service.set_autosession(False) - >>> service.train(corpus) - RuntimeError: must open a session before modifying SessionServer - >>> service.open_session() - >>> service.train(corpus) - >>> service.index(corpus) - >>> service.delete(doc_ids) - -None of these changes are visible to other clients, yet. Also, other clients' -calls to index/train/etc will block until this session is committed/rolled back---there -cannot be two open sessions at the same time. - -To end a session - -.. sourcecode:: pycon - - >>> service.rollback() # discard all changes since open_session() - -or - -.. sourcecode:: pycon - - >>> service.commit() # make changes public; now other clients can see changes/acquire the modification lock - - -Other stuff ------------- - -TODO Custom document parsing (in lieu of `utils.simple_preprocess`). Different models (not just `lsi`). Optimizing the index with `service.optimize()`. -TODO add some hard numbers; example tutorial for some bigger collection, e.g. for `arxiv.org `_ or wikipedia. 
- diff --git a/docs/src/sklearn_api/atmodel.rst b/docs/src/sklearn_api/atmodel.rst deleted file mode 100644 index 4f935f0498..0000000000 --- a/docs/src/sklearn_api/atmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.atmodel` -- Scikit learn wrapper for Author-topic model -========================================================================= - -.. automodule:: gensim.sklearn_api.atmodel - :synopsis: Scikit learn wrapper for Author-topic model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/d2vmodel.rst b/docs/src/sklearn_api/d2vmodel.rst deleted file mode 100644 index 707daae639..0000000000 --- a/docs/src/sklearn_api/d2vmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.d2vmodel` -- Scikit learn wrapper for paragraph2vec model -=========================================================================== - -.. automodule:: gensim.sklearn_api.d2vmodel - :synopsis: Scikit learn wrapper for paragraph2vec model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/hdp.rst b/docs/src/sklearn_api/hdp.rst deleted file mode 100644 index 80e0fb0c70..0000000000 --- a/docs/src/sklearn_api/hdp.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.hdp` -- Scikit learn wrapper for Hierarchical Dirichlet Process model -======================================================================================= - -.. automodule:: gensim.sklearn_api.hdp - :synopsis: Scikit learn wrapper for Hierarchical Dirichlet Process model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/ldamodel.rst b/docs/src/sklearn_api/ldamodel.rst deleted file mode 100644 index 3ae03f8b61..0000000000 --- a/docs/src/sklearn_api/ldamodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.ldamodel` -- Scikit learn wrapper for Latent Dirichlet Allocation -=================================================================================== - -.. automodule:: gensim.sklearn_api.ldamodel - :synopsis: Scikit learn wrapper for LDA model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/ldaseqmodel.rst b/docs/src/sklearn_api/ldaseqmodel.rst deleted file mode 100644 index f840c8ceec..0000000000 --- a/docs/src/sklearn_api/ldaseqmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.ldaseqmodel` -- Scikit learn wrapper for LdaSeq model -======================================================================= - -.. automodule:: gensim.sklearn_api.ldaseqmodel - :synopsis: Scikit learn wrapper for LdaSeq model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/lsimodel.rst b/docs/src/sklearn_api/lsimodel.rst deleted file mode 100644 index 9b2142a166..0000000000 --- a/docs/src/sklearn_api/lsimodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.lsimodel` -- Scikit learn wrapper for Latent Semantic Indexing -================================================================================ - -.. 
automodule:: gensim.sklearn_api.lsimodel - :synopsis: Scikit learn wrapper for LdaSeq model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/phrases.rst b/docs/src/sklearn_api/phrases.rst deleted file mode 100644 index c48326a487..0000000000 --- a/docs/src/sklearn_api/phrases.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.phrases` -- Scikit learn wrapper for phrase (collocation) detection -===================================================================================== - -.. automodule:: gensim.sklearn_api.phrases - :synopsis: Scikit learn wrapper for phrase (collocation) detection - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/rpmodel.rst b/docs/src/sklearn_api/rpmodel.rst deleted file mode 100644 index 47c0f41a49..0000000000 --- a/docs/src/sklearn_api/rpmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.rpmodel` -- Scikit learn wrapper for Random Projection model -============================================================================== - -.. automodule:: gensim.sklearn_api.rpmodel - :synopsis: Scikit learn wrapper for Random Projection model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/text2bow.rst b/docs/src/sklearn_api/text2bow.rst deleted file mode 100644 index 80148c787b..0000000000 --- a/docs/src/sklearn_api/text2bow.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.text2bow` -- Scikit learn wrapper word<->id mapping -===================================================================================== - -.. automodule:: gensim.sklearn_api.text2bow - :synopsis: Scikit learn wrapper word<->id mapping - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/tfidf.rst b/docs/src/sklearn_api/tfidf.rst deleted file mode 100644 index eab16f4c6d..0000000000 --- a/docs/src/sklearn_api/tfidf.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.tfidf` -- Scikit learn wrapper for TF-IDF model -========================================================================== - -.. automodule:: gensim.sklearn_api.tfidf - :synopsis: Scikit learn wrapper for TF-IDF model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/sklearn_api/w2vmodel.rst b/docs/src/sklearn_api/w2vmodel.rst deleted file mode 100644 index cca5d078e0..0000000000 --- a/docs/src/sklearn_api/w2vmodel.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`sklearn_api.w2vmodel` -- Scikit learn wrapper for word2vec model -====================================================================== - -.. 
automodule:: gensim.sklearn_api.w2vmodel - :synopsis: Scikit learn wrapper for word2vec model - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx index 37ea774512..3c32797de8 100644 --- a/gensim/corpora/_mmreader.pyx +++ b/gensim/corpora/_mmreader.pyx @@ -2,16 +2,12 @@ # cython: embedsignature=True """Reader for corpus in the Matrix Market format.""" - -from __future__ import with_statement - -from gensim import utils - import logging cimport cython from libc.stdio cimport sscanf +from gensim import utils logger = logging.getLogger(__name__) diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 075f46e23c..8aa19a0465 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -22,8 +22,6 @@ from .fasttext import FastText # noqa:F401 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 -from . import wrappers # noqa:F401 - from gensim import interfaces, utils diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index d0e3d653ef..06ba520e90 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -69,11 +69,6 @@ def resolve_weights(smartirs): ValueError If `smartirs` is not a string of length 3 or one of the decomposed value doesn't fit the list of permissible values. - - See Also - -------- - ~gensim.sklearn_api.tfidf.TfIdfTransformer, TfidfModel : Classes that also use the SMART scheme. - """ if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index c53d252bf4..87e15b7d60 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -21,8 +21,7 @@ ================ There are more ways to train word vectors in Gensim than just Word2Vec. -See also :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and -wrappers for :class:`~gensim.models.wrappers.varembed.VarEmbed` and :class:`~gensim.models.wrappers.wordrank.WordRank`. +See also :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText`. The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality and diff --git a/gensim/models/wrappers/__init__.py b/gensim/models/wrappers/__init__.py deleted file mode 100644 index 330abce500..0000000000 --- a/gensim/models/wrappers/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -This package contains wrappers for other topic modeling programs. -""" - -from .ldamallet import LdaMallet # noqa:F401 -from .dtmmodel import DtmModel # noqa:F401 -from .ldavowpalwabbit import LdaVowpalWabbit # noqa:F401 -from .wordrank import Wordrank # noqa:F401 -from .varembed import VarEmbed # noqa:F401 diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py deleted file mode 100644 index 7271e45cf1..0000000000 --- a/gensim/models/wrappers/dtmmodel.py +++ /dev/null @@ -1,613 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2014 Artyom Topchyan -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -# Based on Copyright (C) 2014 Radim Rehurek - - -"""Python wrapper for `Dynamic Topic Models (DTM) `_ -and the `Document Influence Model (DIM) `_. - -Installation ------------- - -You have 2 ways, how to make binaries: - -#. 
Use precompiled binaries for your OS version from `/magsilva/dtm/ `_ -#. Compile binaries manually from `/blei-lab/dtm `_ - (original instruction available in https://github.com/blei-lab/dtm/blob/master/README.md), or use this :: - - git clone https://github.com/blei-lab/dtm.git - sudo apt-get install libgsl0-dev - cd dtm/dtm - make - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.models.wrappers import DtmModel - >>> - >>> path_to_dtm_binary = "/path/to/dtm/binary" - >>> model = DtmModel( - ... path_to_dtm_binary, corpus=common_corpus, id2word=common_dictionary, - ... time_slices=[1] * len(common_corpus) - ... ) - -""" - - -import logging -import random -import warnings -import tempfile -import os -from subprocess import PIPE -import numpy as np - -from gensim import utils, corpora, matutils -from gensim.utils import check_output - -logger = logging.getLogger(__name__) - - -class DtmModel(utils.SaveLoad): - """Python wrapper using `DTM implementation `_. - - Communication between DTM and Python takes place by passing around data files on disk and executing - the DTM binary as a subprocess. - - Warnings - -------- - This is **only** python wrapper for `DTM implementation `_, - you need to install original implementation first and pass the path to binary to ``dtm_path``. - - """ - def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100, - id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10, - alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True): - """ - - Parameters - ---------- - dtm_path : str - Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`. - corpus : iterable of iterable of (int, int) - Collection of texts in BoW format. - time_slices : list of int - Sequence of timestamps. - mode : {'fit', 'time'}, optional - Controls the mode of the mode: 'fit' is for training, 'time' for analyzing documents through time - according to a DTM, basically a held out set. - model : {'fixed', 'dtm'}, optional - Control model that will be runned: 'fixed' is for DIM and 'dtm' for DTM. - num_topics : int, optional - Number of topics. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`. - prefix : str, optional - Prefix for produced temporary files. - lda_sequence_min_iter : int, optional - Min iteration of LDA. - lda_sequence_max_iter : int, optional - Max iteration of LDA. - lda_max_em_iter : int, optional - Max em optimization iterations in LDA. - alpha : int, optional - Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice. - top_chain_var : float, optional - This hyperparameter controls one of the key aspect of topic evolution which is the speed at which - these topics evolve. A smaller top_chain_var leads to similar word distributions over multiple timeslice. - - rng_seed : int, optional - Random seed. - initialize_lda : bool, optional - If True - initialize DTM with LDA. 
- - """ - if not os.path.isfile(dtm_path): - raise ValueError("dtm_path must point to the binary file, not to a folder") - - self.dtm_path = dtm_path - self.id2word = id2word - if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") - self.id2word = utils.dict_from_corpus(corpus) - self.num_terms = len(self.id2word) - else: - self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys()) - if self.num_terms == 0: - raise ValueError("cannot compute DTM over an empty collection (no terms)") - self.num_topics = num_topics - - try: - lencorpus = len(corpus) - except TypeError: - logger.warning("input corpus stream has no len(); counting documents") - lencorpus = sum(1 for _ in corpus) - if lencorpus == 0: - raise ValueError("cannot compute DTM over an empty corpus") - if model == "fixed" and any(not text for text in corpus): - raise ValueError("""There is a text without words in the input corpus. - This breaks method='fixed' (The DIM model).""") - if lencorpus != sum(time_slices): - raise ValueError( - "mismatched timeslices %{slices} for corpus of len {clen}" - .format(slices=sum(time_slices), clen=lencorpus) - ) - self.lencorpus = lencorpus - if prefix is None: - rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' - prefix = os.path.join(tempfile.gettempdir(), rand_prefix) - - self.prefix = prefix - self.time_slices = time_slices - self.lda_sequence_min_iter = int(lda_sequence_min_iter) - self.lda_sequence_max_iter = int(lda_sequence_max_iter) - self.lda_max_em_iter = int(lda_max_em_iter) - self.alpha = alpha - self.top_chain_var = top_chain_var - self.rng_seed = rng_seed - self.initialize_lda = str(initialize_lda).lower() - - self.lambda_ = None - self.obs_ = None - self.lhood_ = None - self.gamma_ = None - self.init_alpha = None - self.init_beta = None - self.init_ss = None - self.em_steps = [] - self.influences_time = [] - - if corpus is not None: - self.train(corpus, time_slices, mode, model) - - def fout_liklihoods(self): - """Get path to temporary lhood data file. - - Returns - ------- - str - Path to lhood data file. - - """ - return self.prefix + 'train_out/lda-seq/' + 'lhoods.dat' - - def fout_gamma(self): - """Get path to temporary gamma data file. - - Returns - ------- - str - Path to gamma data file. - - """ - return self.prefix + 'train_out/lda-seq/' + 'gam.dat' - - def fout_prob(self): - """Get template of path to temporary file. - - Returns - ------- - str - Path to file. - - """ - return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-e-log-prob.dat' - - def fout_observations(self): - """Get template of path to temporary file. - - Returns - ------- - str - Path to file. - - """ - return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-obs.dat' - - def fout_influence(self): - """Get template of path to temporary file. - - Returns - ------- - str - Path to file. - - """ - return self.prefix + 'train_out/lda-seq/' + 'influence_time-{i}' - - def foutname(self): - """Get path to temporary file. - - Returns - ------- - str - Path to file. - - """ - return self.prefix + 'train_out' - - def fem_steps(self): - """Get path to temporary em_step data file. - - Returns - ------- - str - Path to em_step data file. - - """ - return self.prefix + 'train_out/' + 'em_log.dat' - - def finit_alpha(self): - """Get path to initially trained lda alpha file. - - Returns - ------- - str - Path to initially trained lda alpha file. 
- - """ - return self.prefix + 'train_out/' + 'initial-lda.alpha' - - def finit_beta(self): - """Get path to initially trained lda beta file. - - Returns - ------- - str - Path to initially trained lda beta file. - - """ - return self.prefix + 'train_out/' + 'initial-lda.beta' - - def flda_ss(self): - """Get path to initial lda binary file. - - Returns - ------- - str - Path to initial lda binary file. - - """ - return self.prefix + 'train_out/' + 'initial-lda-ss.dat' - - def fcorpustxt(self): - """Get path to temporary file. - - Returns - ------- - str - Path to multiple train binary file. - - """ - return self.prefix + 'train-mult.dat' - - def fcorpus(self): - """Get path to corpus file. - - Returns - ------- - str - Path to corpus file. - - """ - return self.prefix + 'train' - - def ftimeslices(self): - """Get path to time slices binary file. - - Returns - ------- - str - Path to time slices binary file. - - """ - return self.prefix + 'train-seq.dat' - - def convert_input(self, corpus, time_slices): - """Convert corpus into LDA-C format by :class:`~gensim.corpora.bleicorpus.BleiCorpus` and save to temp file. - Path to temporary file produced by :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.ftimeslices`. - - Parameters - ---------- - corpus : iterable of iterable of (int, float) - Corpus in BoW format. - time_slices : list of int - Sequence of timestamps. - - """ - logger.info("serializing temporary corpus to %s", self.fcorpustxt()) - # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) - - with utils.open(self.ftimeslices(), 'wb') as fout: - fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) - for sl in time_slices: - fout.write(utils.to_utf8(str(sl) + "\n")) - - def train(self, corpus, time_slices, mode, model): - """Train DTM model. - - Parameters - ---------- - corpus : iterable of iterable of (int, int) - Collection of texts in BoW format. - time_slices : list of int - Sequence of timestamps. - mode : {'fit', 'time'}, optional - Controls the mode of the mode: 'fit' is for training, 'time' for analyzing documents through time - according to a DTM, basically a held out set. - model : {'fixed', 'dtm'}, optional - Control model that will be runned: 'fixed' is for DIM and 'dtm' for DTM. 
- - """ - self.convert_input(corpus, time_slices) - - arguments = \ - "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \ - "--outname={p4} --alpha={p5}".format( - p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, - p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha - ) - - params = \ - "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \ - "--top_chain_var={p3} --rng_seed={p4} ".format( - p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, - p3=self.top_chain_var, p4=self.rng_seed - ) - - arguments = arguments + " " + params - logger.info("training DTM with args %s", arguments) - - cmd = [self.dtm_path] + arguments.split() - logger.info("Running command %s", cmd) - check_output(args=cmd, stderr=PIPE) - - self.em_steps = np.loadtxt(self.fem_steps()) - self.init_ss = np.loadtxt(self.flda_ss()) - - if self.initialize_lda: - self.init_alpha = np.loadtxt(self.finit_alpha()) - self.init_beta = np.loadtxt(self.finit_beta()) - - self.lhood_ = np.loadtxt(self.fout_liklihoods()) - - # document-topic proportions - self.gamma_ = np.loadtxt(self.fout_gamma()) - # cast to correct shape, gamme[5,10] is the proprtion of the 10th topic - # in doc 5 - self.gamma_.shape = (self.lencorpus, self.num_topics) - # normalize proportions - self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis] - - self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices))) - self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices))) - - for t in range(self.num_topics): - topic = "%03d" % t - self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic)) - self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic)) - # cast to correct shape, lambda[5,10,0] is the proportion of the 10th - # topic in doc 5 at time 0 - self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices)) - self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices)) - # extract document influence on topics for each time slice - # influences_time[0] , influences at time 0 - if model == 'fixed': - for k, t in enumerate(self.time_slices): - stamp = "%03d" % k - influence = np.loadtxt(self.fout_influence().format(i=stamp)) - influence.shape = (t, self.num_topics) - # influence[2,5] influence of document 2 on topic 5 - self.influences_time.append(influence) - - def print_topics(self, num_topics=10, times=5, num_words=10): - """Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.show_topics`. - - Parameters - ---------- - num_topics : int, optional - Number of topics to return, set `-1` to get all topics. - times : int, optional - Number of times. - num_words : int, optional - Number of words. - - Returns - ------- - list of str - Topics as a list of strings - - """ - return self.show_topics(num_topics, times, num_words, log=True) - - def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted=True): - """Get the `num_words` most probable words for `num_topics` number of topics at 'times' time slices. - - Parameters - ---------- - num_topics : int, optional - Number of topics to return, set `-1` to get all topics. - times : int, optional - Number of times. - num_words : int, optional - Number of words. - log : bool, optional - THIS PARAMETER WILL BE IGNORED. - formatted : bool, optional - If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs. 
- - Returns - ------- - list of str - Topics as a list of strings (if formatted=True) **OR** - list of (float, str) - Topics as list of (weight, word) pairs (if formatted=False) - - """ - if num_topics < 0 or num_topics >= self.num_topics: - num_topics = self.num_topics - chosen_topics = range(num_topics) - else: - num_topics = min(num_topics, self.num_topics) - chosen_topics = range(num_topics) - - if times < 0 or times >= len(self.time_slices): - times = len(self.time_slices) - chosen_times = range(times) - else: - times = min(times, len(self.time_slices)) - chosen_times = range(times) - - shown = [] - for time in chosen_times: - for i in chosen_topics: - if formatted: - topic = self.print_topic(i, time, topn=num_words) - else: - topic = self.show_topic(i, time, topn=num_words) - shown.append(topic) - return shown - - def show_topic(self, topicid, time, topn=50, num_words=None): - """Get `num_words` most probable words for the given `topicid`. - - Parameters - ---------- - topicid : int - Id of topic. - time : int - Timestamp. - topn : int, optional - Top number of topics that you'll receive. - num_words : int, optional - DEPRECATED PARAMETER, use `topn` instead. - - Returns - ------- - list of (float, str) - Sequence of probable words, as a list of `(word_probability, word)`. - - """ - if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.") - topn = num_words - - topics = self.lambda_[:, :, time] - topic = topics[topicid] - # likelihood to probability - topic = np.exp(topic) - # normalize to probability dist - topic = topic / topic.sum() - # sort according to prob - bestn = matutils.argsort(topic, topn, reverse=True) - beststr = [(topic[idx], self.id2word[idx]) for idx in bestn] - return beststr - - def print_topic(self, topicid, time, topn=10, num_words=None): - """Get the given topic, formatted as a string. - - Parameters - ---------- - topicid : int - Id of topic. - time : int - Timestamp. - topn : int, optional - Top number of topics that you'll receive. - num_words : int, optional - DEPRECATED PARAMETER, use `topn` instead. - - Returns - ------- - str - The given topic in string format, like '0.132*someword + 0.412*otherword + ...'. - - """ - if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.") - topn = num_words - - return ' + '.join('%.3f*%s' % v for v in self.show_topic(topicid, time, topn=topn)) - - def dtm_vis(self, corpus, time): - """Get data specified by pyLDAvis format. - - Parameters - ---------- - corpus : iterable of iterable of (int, float) - Collection of texts in BoW format. - time : int - Sequence of timestamp. - - Notes - ----- - All of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis. - - Returns - ------- - doc_topic : numpy.ndarray - Document-topic proportions. - topic_term : numpy.ndarray - Calculated term of topic suitable for pyLDAvis format. - doc_lengths : list of int - Length of each documents in corpus. - term_frequency : numpy.ndarray - Frequency of each word from vocab. - vocab : list of str - List of words from docpus. 
- - """ - topic_term = np.exp(self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum() - topic_term *= self.num_topics - - doc_topic = self.gamma_ - - doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)] - - term_frequency = np.zeros(len(self.id2word)) - for doc_no, doc in enumerate(corpus): - for pair in doc: - term_frequency[pair[0]] += pair[1] - - vocab = [self.id2word[i] for i in range(0, len(self.id2word))] - # returns numpy arrays for doc_topic proportions, topic_term proportions, and document_lengths, term_frequency. - # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics. - return doc_topic, topic_term, doc_lengths, term_frequency, vocab - - def dtm_coherence(self, time, num_words=20): - """Get all topics of a particular time-slice without probability values for it to be used. - For either "u_mass" or "c_v" coherence. - - Parameters - ---------- - num_words : int - Number of words. - time : int - Timestamp - - Returns - ------- - coherence_topics : list of list of str - All topics of a particular time-slice without probability values for it to be used. - - Warnings - -------- - TODO: because of print format right now can only return for 1st time-slice, should we fix the coherence - printing or make changes to the print statements to mirror DTM python? - - """ - coherence_topics = [] - for topic_no in range(0, self.num_topics): - topic = self.show_topic(topicid=topic_no, time=time, topn=num_words) - coherence_topic = [] - for prob, word in topic: - coherence_topic.append(word) - coherence_topics.append(coherence_topic) - - return coherence_topics diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py deleted file mode 100644 index a7660b2eff..0000000000 --- a/gensim/models/wrappers/ldamallet.py +++ /dev/null @@ -1,611 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2014 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -r"""Python wrapper for `Latent Dirichlet Allocation (LDA) `_ -from `MALLET, the Java topic modelling toolkit `_ - -This module allows both LDA model estimation from a training corpus and inference of topic distribution on new, -unseen documents, using an (optimized version of) collapsed gibbs sampling from MALLET. - -Notes ------ -MALLET's LDA training requires :math:`O(corpus\_words)` of memory, keeping the entire corpus in RAM. -If you find yourself running out of memory, either decrease the `workers` constructor parameter, -or use :class:`gensim.models.ldamodel.LdaModel` or :class:`gensim.models.ldamulticore.LdaMulticore` -which needs only :math:`O(1)` memory. -The wrapped model can NOT be updated with new documents for online training -- use -:class:`~gensim.models.ldamodel.LdaModel` or :class:`~gensim.models.ldamulticore.LdaMulticore` for that. - -Installation ------------- -Use `official guide `_ or this one :: - - sudo apt-get install default-jdk - sudo apt-get install ant - git clone git@github.com:mimno/Mallet.git - cd Mallet/ - ant - -Examples --------- -.. 
sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.models.wrappers import LdaMallet - >>> - >>> path_to_mallet_binary = "/path/to/mallet/binary" - >>> model = LdaMallet(path_to_mallet_binary, corpus=common_corpus, num_topics=20, id2word=common_dictionary) - >>> vector = model[common_corpus[0]] # LDA topics of a documents - -""" - - -import logging -import os -import random -import warnings -import tempfile -import xml.etree.ElementTree as et -import zipfile -from itertools import chain - -import numpy - -from gensim import utils, matutils -from gensim.models import basemodel -from gensim.models.ldamodel import LdaModel -from gensim.utils import check_output, revdict - -logger = logging.getLogger(__name__) - - -class LdaMallet(utils.SaveLoad, basemodel.BaseTopicModel): - """Python wrapper for LDA using `MALLET `_. - - Communication between MALLET and Python takes place by passing around data files on disk - and calling Java with subprocess.call(). - - Warnings - -------- - This is **only** python wrapper for `MALLET LDA `_, - you need to install original implementation first and pass the path to binary to ``mallet_path``. - - """ - def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None, - optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=0): - """ - - Parameters - ---------- - mallet_path : str - Path to the mallet binary, e.g. `/home/username/mallet-2.0.7/bin/mallet`. - corpus : iterable of iterable of (int, int), optional - Collection of texts in BoW format. - num_topics : int, optional - Number of topics. - alpha : int, optional - Alpha parameter of LDA. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`. - workers : int, optional - Number of threads that will be used for training. - prefix : str, optional - Prefix for produced temporary files. - optimize_interval : int, optional - Optimize hyperparameters every `optimize_interval` iterations - (sometimes leads to Java exception 0 to switch off hyperparameter optimization). - iterations : int, optional - Number of training iterations. - topic_threshold : float, optional - Threshold of the probability above which we consider a topic. - random_seed: int, optional - Random seed to ensure consistent results, if 0 - use system clock. - - """ - self.mallet_path = mallet_path - self.id2word = id2word - if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") - self.id2word = utils.dict_from_corpus(corpus) - self.num_terms = len(self.id2word) - else: - self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys()) - if self.num_terms == 0: - raise ValueError("cannot compute LDA over an empty collection (no terms)") - self.num_topics = num_topics - self.topic_threshold = topic_threshold - self.alpha = alpha - if prefix is None: - rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' - prefix = os.path.join(tempfile.gettempdir(), rand_prefix) - self.prefix = prefix - self.workers = workers - self.optimize_interval = optimize_interval - self.iterations = iterations - self.random_seed = random_seed - if corpus is not None: - self.train(corpus) - - def finferencer(self): - """Get path to inferencer.mallet file. - - Returns - ------- - str - Path to inferencer.mallet file. 
- - """ - return self.prefix + 'inferencer.mallet' - - def ftopickeys(self): - """Get path to topic keys text file. - - Returns - ------- - str - Path to topic keys text file. - - """ - return self.prefix + 'topickeys.txt' - - def fstate(self): - """Get path to temporary file. - - Returns - ------- - str - Path to file. - - """ - return self.prefix + 'state.mallet.gz' - - def fdoctopics(self): - """Get path to document topic text file. - - Returns - ------- - str - Path to document topic text file. - - """ - return self.prefix + 'doctopics.txt' - - def fcorpustxt(self): - """Get path to corpus text file. - - Returns - ------- - str - Path to corpus text file. - - """ - return self.prefix + 'corpus.txt' - - def fcorpusmallet(self): - """Get path to corpus.mallet file. - - Returns - ------- - str - Path to corpus.mallet file. - - """ - return self.prefix + 'corpus.mallet' - - def fwordweights(self): - """Get path to word weight file. - - Returns - ------- - str - Path to word weight file. - - """ - return self.prefix + 'wordweights.txt' - - def corpus2mallet(self, corpus, file_like): - """Convert `corpus` to Mallet format and write it to `file_like` descriptor. - - Format :: - - document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens[NEWLINE] - - Parameters - ---------- - corpus : iterable of iterable of (int, int) - Collection of texts in BoW format. - file_like : file-like object - Opened file. - - """ - for docno, doc in enumerate(corpus): - if self.id2word: - tokens = chain.from_iterable([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc) - else: - tokens = chain.from_iterable([str(tokenid)] * int(cnt) for tokenid, cnt in doc) - file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens)))) - - def convert_input(self, corpus, infer=False, serialize_corpus=True): - """Convert corpus to Mallet format and save it to a temporary text file. - - Parameters - ---------- - corpus : iterable of iterable of (int, int) - Collection of texts in BoW format. - infer : bool, optional - ... - serialize_corpus : bool, optional - ... - - """ - if serialize_corpus: - logger.info("serializing temporary corpus to %s", self.fcorpustxt()) - with utils.open(self.fcorpustxt(), 'wb') as fout: - self.corpus2mallet(corpus, fout) - - # convert the text file above into MALLET's internal format - cmd = \ - self.mallet_path + \ - " import-file --preserve-case --keep-sequence " \ - "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s" - if infer: - cmd += ' --use-pipe-from ' + self.fcorpusmallet() - cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer') - else: - cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet()) - logger.info("converting temporary corpus to MALLET format with %s", cmd) - check_output(args=cmd, shell=True) - - def train(self, corpus): - """Train Mallet LDA. 
- - Parameters - ---------- - corpus : iterable of iterable of (int, int) - Corpus in BoW format - - """ - self.convert_input(corpus, infer=False) - cmd = self.mallet_path + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s '\ - '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\ - '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s --random-seed %s' - - cmd = cmd % ( - self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, - self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, - self.finferencer(), self.topic_threshold, str(self.random_seed) - ) - # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory - logger.info("training MALLET LDA with %s", cmd) - check_output(args=cmd, shell=True) - self.word_topics = self.load_word_topics() - # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. - # word_topics has replaced wordtopics throughout the code; - # wordtopics just stores the values of word_topics when train is called. - self.wordtopics = self.word_topics - - def __getitem__(self, bow, iterations=100): - """Get vector for document(s). - - Parameters - ---------- - bow : {list of (int, int), iterable of list of (int, int)} - Document (or corpus) in BoW format. - iterations : int, optional - Number of iterations that will be used for inferring. - - Returns - ------- - list of (int, float) - LDA vector for document as sequence of (topic_id, topic_probability) **OR** - list of list of (int, float) - LDA vectors for corpus in same format. - - """ - is_corpus, corpus = utils.is_corpus(bow) - if not is_corpus: - # query is a single document => make a corpus out of it - bow = [bow] - - self.convert_input(bow, infer=True) - cmd = \ - self.mallet_path + ' infer-topics --input %s --inferencer %s ' \ - '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s --random-seed %s' - cmd = cmd % ( - self.fcorpusmallet() + '.infer', self.finferencer(), - self.fdoctopics() + '.infer', iterations, self.topic_threshold, str(self.random_seed) - ) - logger.info("inferring topics with MALLET LDA '%s'", cmd) - check_output(args=cmd, shell=True) - result = list(self.read_doctopics(self.fdoctopics() + '.infer')) - return result if is_corpus else result[0] - - def load_word_topics(self): - """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file. - - Returns - ------- - numpy.ndarray - Matrix words X topics. - - """ - logger.info("loading assigned topics from %s", self.fstate()) - word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64) - if hasattr(self.id2word, 'token2id'): - word2id = self.id2word.token2id - else: - word2id = revdict(self.id2word) - - with utils.open(self.fstate(), 'rb') as fin: - _ = next(fin) # header - self.alpha = numpy.fromiter(next(fin).split()[2:], dtype=float) - assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics" - _ = next(fin) # noqa:F841 beta - for lineno, line in enumerate(fin): - line = utils.to_unicode(line) - doc, source, pos, typeindex, token, topic = line.split(" ") - if token not in word2id: - continue - tokenid = word2id[token] - word_topics[int(topic), tokenid] += 1.0 - return word_topics - - def load_document_topics(self): - """Load document topics from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fdoctopics` file. 
- Shortcut for :meth:`gensim.models.wrappers.ldamallet.LdaMallet.read_doctopics`. - - Returns - ------- - iterator of list of (int, float) - Sequence of LDA vectors for documents. - - """ - return self.read_doctopics(self.fdoctopics()) - - def get_topics(self): - """Get topics X words matrix. - - Returns - ------- - numpy.ndarray - Topics X words matrix, shape `num_topics` x `vocabulary_size`. - - """ - topics = self.word_topics - return topics / topics.sum(axis=1)[:, None] - - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): - """Get the `num_words` most probable words for `num_topics` number of topics. - - Parameters - ---------- - num_topics : int, optional - Number of topics to return, set `-1` to get all topics. - num_words : int, optional - Number of words. - log : bool, optional - If True - write topic with logging too, used for debug proposes. - formatted : bool, optional - If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs. - - Returns - ------- - list of str - Topics as a list of strings (if formatted=True) **OR** - list of (float, str) - Topics as list of (weight, word) pairs (if formatted=False) - - """ - if num_topics < 0 or num_topics >= self.num_topics: - num_topics = self.num_topics - chosen_topics = range(num_topics) - else: - num_topics = min(num_topics, self.num_topics) - # add a little random jitter, to randomize results around the same alpha - sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) - sorted_topics = list(matutils.argsort(sort_alpha)) - chosen_topics = sorted_topics[: num_topics // 2] + sorted_topics[-num_topics // 2:] - shown = [] - for i in chosen_topics: - if formatted: - topic = self.print_topic(i, topn=num_words) - else: - topic = self.show_topic(i, topn=num_words) - shown.append((i, topic)) - if log: - logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic) - return shown - - def show_topic(self, topicid, topn=10, num_words=None): - """Get `num_words` most probable words for the given `topicid`. - - Parameters - ---------- - topicid : int - Id of topic. - topn : int, optional - Top number of topics that you'll receive. - num_words : int, optional - DEPRECATED PARAMETER, use `topn` instead. - - Returns - ------- - list of (str, float) - Sequence of probable words, as a list of `(word, word_probability)` for `topicid` topic. - - """ - if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.") - topn = num_words - - if self.word_topics is None: - logger.warning("Run train or load_word_topics before showing topics.") - topic = self.word_topics[topicid] - topic = topic / topic.sum() # normalize to probability dist - bestn = matutils.argsort(topic, topn, reverse=True) - beststr = [(self.id2word[idx], topic[idx]) for idx in bestn] - return beststr - - def get_version(self, direc_path): - """"Get the version of Mallet. - - Parameters - ---------- - direc_path : str - Path to mallet archive. - - Returns - ------- - str - Version of mallet. 
- - """ - try: - archive = zipfile.ZipFile(direc_path, 'r') - if u'cc/mallet/regression/' not in archive.namelist(): - return '2.0.7' - else: - return '2.0.8RC3' - except Exception: - - xml_path = direc_path.split("bin")[0] - try: - doc = et.parse(xml_path + "pom.xml").getroot() - namespace = doc.tag[:doc.tag.index('}') + 1] - return doc.find(namespace + 'version').text.split("-")[0] - except Exception: - return "Can't parse pom.xml version file" - - def read_doctopics(self, fname, eps=1e-6, renorm=True): - """Get document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors. - - Parameters - ---------- - fname : str - Path to input file with document topics. - eps : float, optional - Threshold for probabilities. - renorm : bool, optional - If True - explicitly re-normalize distribution. - - Raises - ------ - RuntimeError - If any line in invalid format. - - Yields - ------ - list of (int, float) - LDA vectors for document. - - """ - mallet_version = self.get_version(self.mallet_path) - with utils.open(fname, 'rb') as fin: - for lineno, line in enumerate(fin): - if lineno == 0 and line.startswith(b"#doc "): - continue # skip the header line if it exists - - parts = line.split()[2:] # skip "doc" and "source" columns - - # the MALLET doctopic format changed in 2.0.8 to exclude the id, - # this handles the file differently dependent on the pattern - if len(parts) == 2 * self.num_topics: - doc = [ - (int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2) - if abs(float(weight)) > eps - ] - elif len(parts) == self.num_topics and mallet_version != '2.0.7': - doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps] - else: - if mallet_version == "2.0.7": - """ - - 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 - 2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008 - In the above example there is a mix of the above if and elif statement. - There are neither `2*num_topics` nor `num_topics` elements. - It has 2 formats 40.009062076892971008 and 0 1.0780612802674239 - which cannot be handled by above if elif. - Also, there are some topics are missing(meaning that the topic is not there) - which is another reason why the above if elif fails even when the `mallet` - produces the right results - - """ - count = 0 - doc = [] - if len(parts) > 0: - while count < len(parts): - """ - if section is to deal with formats of type 2 0.034 - so if count reaches index of 2 and since int(2) == float(2) so if block is executed - now there is one extra element afer 2, so count + 1 access should not give an error - - else section handles formats of type 20.034 - now count is there on index of 20.034 since float(20.034) != int(20.034) so else block - is executed - - """ - if float(parts[count]) == int(parts[count]): - if float(parts[count + 1]) > eps: - doc.append((int(parts[count]), float(parts[count + 1]))) - count += 2 - else: - if float(parts[count]) - int(parts[count]) > eps: - doc.append((int(parts[count]) % 10, float(parts[count]) - int(parts[count]))) - count += 1 - else: - raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname)) - - if renorm: - # explicitly normalize weights to sum up to 1.0, just to be sure... - total_weight = float(sum(weight for _, weight in doc)) - if total_weight: - doc = [(id_, float(weight) / total_weight) for id_, weight in doc] - yield doc - - @classmethod - def load(cls, *args, **kwargs): - """Load a previously saved LdaMallet class. 
Handles backwards compatibility from - older LdaMallet versions which did not use random_seed parameter. - """ - model = super(LdaMallet, cls).load(*args, **kwargs) - if not hasattr(model, 'random_seed'): - model.random_seed = 0 - - return model - - -def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): - """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`. - - This works by copying the training model weights (alpha, beta...) from a trained mallet model into the gensim model. - - Parameters - ---------- - mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet` - Trained Mallet model - gamma_threshold : float, optional - To be used for inference in the new LdaModel. - iterations : int, optional - Number of iterations to be used for inference in the new LdaModel. - - Returns - ------- - :class:`~gensim.models.ldamodel.LdaModel` - Gensim native LDA. - - """ - model_gensim = LdaModel( - id2word=mallet_model.id2word, num_topics=mallet_model.num_topics, - alpha=mallet_model.alpha, eta=0, - iterations=iterations, - gamma_threshold=gamma_threshold, - dtype=numpy.float64 # don't loose precision when converting from MALLET - ) - model_gensim.state.sstats[...] = mallet_model.wordtopics - model_gensim.sync_state() - return model_gensim diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py deleted file mode 100644 index f7c286a349..0000000000 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ /dev/null @@ -1,888 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2015 Dave Challis -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Python wrapper for `Vowpal Wabbit's Latent Dirichlet Allocation `_. - -This uses `Matt Hoffman's online algorithm -`_, i.e. the same algorithm -that Gensim's :class:`~gensim.models.ldamodel.LdaModel` is based on. - -Installation ------------- -Use `official guide `_ or this one :: - - git clone https://github.com/JohnLangford/vowpal_wabbit.git - cd vowpal_wabbit - make - make test - sudo make install - -Warnings --------- -Currently working and tested with Vowpal Wabbit versions 7.10 to 8.1.1. Vowpal Wabbit's API isn't currently stable, -so this may or may not work with older/newer versions. The aim will be to ensure this wrapper always works with -the latest release of Vowpal Wabbit. - - -Examples --------- - -Train model - -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.models.wrappers import LdaVowpalWabbit - >>> - >>> path_to_wv_binary = "/path/to/vw/binary" - >>> model = LdaVowpalWabbit(path_to_wv_binary, corpus=common_corpus, num_topics=20, id2word=common_dictionary) - -Update existing model - -.. sourcecode:: pycon - - >>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]] - >>> model.update(another_corpus) - -Get topic probability distributions for a document - -.. sourcecode:: pycon - - >>> document_bow = [(1, 1)] - >>> print(model[document_bow]) - -Print topics - -.. sourcecode:: pycon - - >>> print(model.print_topics()) - -Save/load the trained model - -.. sourcecode:: pycon - - >>> from gensim.test.utils import get_tmpfile - >>> - >>> temp_path = get_tmpfile("vw_lda.model") - >>> model.save(temp_path) - >>> - >>> loaded_lda = LdaVowpalWabbit.load(temp_path) - -Calculate log-perplexoty on given corpus - -.. 
sourcecode:: pycon - - >>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]] - >>> print(model.log_perpexity(another_corpus)) - -Vowpal Wabbit works on files, so this wrapper maintains a temporary directory while it's around, -reading/writing there as necessary. - -""" -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import logging -import os -import shutil -import subprocess -import tempfile - -import numpy - -from gensim import utils, matutils -from gensim.models.ldamodel import LdaModel - -logger = logging.getLogger(__name__) - - -class LdaVowpalWabbit(utils.SaveLoad): - """Python wrapper using `Vowpal Wabbit's online LDA `_. - - Communication between Vowpal Wabbit and Python takes place by passing around data files - on disk and calling the 'vw' binary with the subprocess module. - - Warnings - -------- - This is **only** python wrapper for `Vowpal Wabbit's online LDA `_, - you need to install original implementation first and pass the path to binary to ``vw_path``. - - """ - def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, - chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5, - offset=1, gamma_threshold=0.001, random_seed=None, - cleanup_files=True, tmp_prefix='tmp'): - """ - - Parameters - ---------- - vw_path : str - Path to Vowpal Wabbit's binary. - corpus : iterable of list of (int, int), optional - Collection of texts in BoW format. If given, training will start immediately, - otherwise, you should call :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or - :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually for training. - num_topics : int, optional - Number of requested latent topics to be extracted from the training corpus. - Corresponds to VW's ``--lda `` argument. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping from word ids (integers) to words (strings). - chunksize : int, optional - Number of documents examined in each batch. - Corresponds to VW's ``--minibatch `` argument. - passes : int, optional - Number of passes over the dataset to use. - Corresponds to VW's ``--passes `` argument. - alpha : float, optional - Float effecting sparsity of per-document topic weights. - This is applied symmetrically, and should be set higher to when documents are thought to look more similar. - Corresponds to VW's ``--lda_alpha `` argument. - eta : float, optional - Affects the sparsity of topic distributions. - This is applied symmetrically, and should be set higher when topics - are thought to look more similar. - Corresponds to VW's ``--lda_rho `` argument. - decay : float, optional - Learning rate decay, affects how quickly learnt values are forgotten. - Should be set to a value between 0.5 and 1.0 to guarantee convergence. - Corresponds to VW's ``--power_t `` argument. - offset: int, optional - Learning offset, set to higher values to slow down learning on early iterations of the algorithm. - Corresponds to VW's ``--initial_t `` argument. - gamma_threshold : float, optional - Affects when learning loop will be broken out of, higher values will result in earlier loop completion. - Corresponds to VW's ``--epsilon `` argument. - random_seed : int, optional - Sets random seed when learning. - Corresponds to VW's ``--random_seed `` argument. - cleanup_files : bool, optional - Whether or not to delete temporary directory and files used by this wrapper. 
- Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere. - tmp_prefix : str, optional - To prefix temporary working directory name. - - """ - # default parameters are taken from Vowpal Wabbit's defaults, and - # parameter names changed to match Gensim's LdaModel where possible - self.vw_path = vw_path - self.id2word = id2word - - if self.id2word is None: - if corpus is None: - raise ValueError( - "at least one of corpus/id2word must be specified, to establish input space dimensionality" - ) - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") - self.id2word = utils.dict_from_corpus(corpus) - self.num_terms = len(self.id2word) - elif len(self.id2word) > 0: - self.num_terms = 1 + max(self.id2word.keys()) - else: - self.num_terms = 0 - - if self.num_terms == 0: - raise ValueError("cannot compute LDA over an empty collection (no terms)") - - # LDA parameters - self.num_topics = num_topics - self.chunksize = chunksize - self.passes = passes - self.alpha = alpha - self.eta = eta - self.gamma_threshold = gamma_threshold - self.offset = offset - self.decay = decay - self.random_seed = random_seed - self._initial_offset = offset - - # temporary files used for Vowpal Wabbit input/output - self.tmp_dir = None - self.tmp_prefix = tmp_prefix - self.cleanup_files = cleanup_files - self._init_temp_dir(tmp_prefix) - - # used for saving/loading this model's state - self._model_data = None - self._topics_data = None - - # cache loaded topics as numpy array - self._topics = None - - if corpus is not None: - self.train(corpus) - - def train(self, corpus): - """Clear any existing model state, and train on given `corpus`. - - Parameters - ---------- - corpus : iterable of list of (int, int) - Collection of texts in BoW format. - - """ - logger.debug('Training new model from corpus') - - # reset any existing offset, model, or topics generated - self.offset = self._initial_offset - self._topics = None - - corpus_size = write_corpus_as_vw(corpus, self._corpus_filename) - - cmd = self._get_vw_train_command(corpus_size) - - _run_vw_command(cmd) - - # ensure that future updates of this model use correct offset - self.offset += corpus_size - - def update(self, corpus): - """Update existing model with `corpus`. - - Parameters - ---------- - corpus : iterable of list of (int, int) - Collection of texts in BoW format. - - """ - if not os.path.exists(self._model_filename): - return self.train(corpus) - - logger.debug('Updating exiting model from corpus') - - # reset any existing topics generated - self._topics = None - - corpus_size = write_corpus_as_vw(corpus, self._corpus_filename) - - cmd = self._get_vw_update_command(corpus_size) - - _run_vw_command(cmd) - - # ensure that future updates of this model use correct offset - self.offset += corpus_size - - def log_perplexity(self, chunk): - """Get per-word lower bound on log perplexity. - - Parameters - ---------- - chunk : iterable of list of (int, int) - Collection of texts in BoW format. - - Returns - ------- - bound : float - Per-word lower bound on log perplexity. - - """ - vw_data = self._predict(chunk)[1] - corpus_words = sum(cnt for document in chunk for _, cnt in document) - bound = -vw_data['average_loss'] - logger.info( - "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words", - bound, numpy.exp2(-bound), vw_data['corpus_size'], corpus_words - ) - return bound - - def get_topics(self): - """Get topics X words matrix. 
- - Returns - ------- - numpy.ndarray - `num_topics` x `vocabulary_size` array of floats which represents the learned term topic matrix. - - """ - topics = self._get_topics() - return topics / topics.sum(axis=1)[:, None] - - def print_topics(self, num_topics=10, num_words=10): - """Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.show_topics`. - - Parameters - ---------- - num_topics : int, optional - Number of topics to return, set `-1` to get all topics. - num_words : int, optional - Number of words. - - Returns - ------- - list of str - Topics as a list of strings - - """ - return self.show_topics(num_topics, num_words, log=True) - - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): - """Get the `num_words` most probable words for `num_topics` number of topics. - - Parameters - ---------- - num_topics : int, optional - Number of topics to return, set `-1` to get all topics. - num_words : int, optional - Number of words. - log : bool, optional - If True - will write topics with logger. - formatted : bool, optional - If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs. - - Returns - ------- - list of str - Topics as a list of strings (if formatted=True) **OR** - list of (float, str) - Topics as list of (weight, word) pairs (if formatted=False) - - """ - if num_topics < 0 or num_topics >= self.num_topics: - num_topics = self.num_topics - else: - num_topics = min(num_topics, self.num_topics) - - chosen_topics = range(num_topics) - shown = [] - - for i in chosen_topics: - if formatted: - topic = self.print_topic(i, topn=num_words) - else: - topic = self.show_topic(i, topn=num_words) - - shown.append(topic) - - if log: - logger.info("topic #%i (%.3f): %s", i, self.alpha, topic) - - return shown - - def print_topic(self, topicid, topn=10): - """Get text representation of topic. - - Parameters - ---------- - topicid : int - Id of topic. - topn : int, optional - Top number of words in topic. - - Returns - ------- - str - Topic `topicid` in text representation. - - """ - return ' + '.join('{0:.3f}*{1}'.format(v[0], v[1]) for v in self.show_topic(topicid, topn)) - - def show_topic(self, topicid, topn=10): - """Get `num_words` most probable words for the given `topicid`. - - Parameters - ---------- - topicid : int - Id of topic. - topn : int, optional - Top number of topics that you'll receive. - - Returns - ------- - list of (str, float) - Sequence of probable words, as a list of `(word, word_probability)` for `topicid` topic. - - """ - topics = self._get_topics() - topic = topics[topicid] - bestn = matutils.argsort(topic, topn, reverse=True) - return [(topic[t_id], self.id2word[t_id]) for t_id in bestn] - - def save(self, fname, *args, **kwargs): - """Save model to file. - - Parameters - ---------- - fname : str - Path to output file. 
- - """ - if os.path.exists(self._model_filename): - # Vowpal Wabbit uses its own binary model file, read this into - # variable before serialising this object - keeps all data - # self contained within a single serialised file - logger.debug("Reading model bytes from '%s'", self._model_filename) - with utils.open(self._model_filename, 'rb') as fhandle: - self._model_data = fhandle.read() - - if os.path.exists(self._topics_filename): - logger.debug("Reading topic bytes from '%s'", self._topics_filename) - with utils.open(self._topics_filename, 'rb') as fhandle: - self._topics_data = fhandle.read() - - if 'ignore' not in kwargs: - kwargs['ignore'] = frozenset(['_topics', 'tmp_dir']) - - super(LdaVowpalWabbit, self).save(fname, *args, **kwargs) - - @classmethod - def load(cls, fname, *args, **kwargs): - """Load model from `fname`. - - Parameters - ---------- - fname : str - Path to file with :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`. - - """ - lda_vw = super(LdaVowpalWabbit, cls).load(fname, *args, **kwargs) - lda_vw._init_temp_dir(prefix=lda_vw.tmp_prefix) - - if lda_vw._model_data: - # Vowpal Wabbit operates on its own binary model file - deserialise - # to file at load time, making it immediately ready for use - logger.debug("Writing model bytes to '%s'", lda_vw._model_filename) - with utils.open(lda_vw._model_filename, 'wb') as fhandle: - fhandle.write(lda_vw._model_data) - lda_vw._model_data = None # no need to keep in memory after this - - if lda_vw._topics_data: - logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) - with utils.open(lda_vw._topics_filename, 'wb') as fhandle: - fhandle.write(lda_vw._topics_data) - lda_vw._topics_data = None - - return lda_vw - - def __del__(self): - """Cleanup the temporary directory used by this wrapper.""" - if self.cleanup_files and self.tmp_dir: - logger.debug("Recursively deleting: %s", self.tmp_dir) - shutil.rmtree(self.tmp_dir) - - def _init_temp_dir(self, prefix='tmp'): - """Create a working temporary directory with given prefix. - - Parameters - ---------- - prefix : str - Prefix of the temporary directory. - - """ - self.tmp_dir = tempfile.mkdtemp(prefix=prefix) - logger.info('using %s as temp dir', self.tmp_dir) - - def _get_vw_predict_command(self, corpus_size): - """Get list of command line arguments for running prediction. - - Parameters - ---------- - corpus_size : int - Size of the corpus. - - """ - cmd = [ - self.vw_path, - '--testonly', # don't update model with this data - '--lda_D', str(corpus_size), - '-i', self._model_filename, # load existing binary model - '-d', self._corpus_filename, - '--learning_rate', '0', # possibly not needed, but harmless - '-p', self._predict_filename - ] - - if self.random_seed is not None: - cmd.extend(['--random_seed', str(self.random_seed)]) - - return cmd - - def _get_vw_train_command(self, corpus_size, update=False): - """Get list of command line arguments for running model training. - - Parameters - ---------- - corpus_size : int - Size of corpus. - update : bool - Set `True` to further train an existing model. - - Returns - ------- - list of str - Sequence of all training parameters. 
- - """ - cmd = [ - self.vw_path, - '-d', self._corpus_filename, - '--power_t', str(self.decay), - '--initial_t', str(self.offset), - '--minibatch', str(self.chunksize), - '--lda_D', str(corpus_size), - '--passes', str(self.passes), - '--cache_file', self._cache_filename, - '--lda_epsilon', str(self.gamma_threshold), - '--readable_model', self._topics_filename, - '-k', # clear cache - '-f', self._model_filename - ] - - if update: - cmd.extend(['-i', self._model_filename]) - else: - # these params are read from model file if updating - cmd.extend([ - '--lda', str(self.num_topics), - '-b', str(self.num_terms.bit_length()), - '--lda_alpha', str(self.alpha), - '--lda_rho', str(self.eta) - ]) - - if self.random_seed is not None: - cmd.extend(['--random_seed', str(self.random_seed)]) - - return cmd - - def _get_vw_update_command(self, corpus_size): - """Get list of command line arguments to update a model. - Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel._get_vw_train_command` - - Parameters - ---------- - corpus_size : int - Size of the corpus. - - Returns - ------- - list of str - Sequence of all training parameters. - - """ - return self._get_vw_train_command(corpus_size, update=True) - - def _load_vw_topics(self): - """Read topics file generated by Vowpal Wabbit, convert to numpy array.""" - topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32) - - with utils.open(self._topics_filename, 'rb') as topics_file: - found_data = False - - for line in topics_file: - # look for start of data - if not found_data: - if line.startswith(b'0 ') and b':' not in line: - found_data = True - else: - continue - - fields = line.split() - word_id = int(fields[0]) - - # output contains entries for 2**b terms, where b was set - # by the '-b' option, ignore anything past num_terms - if word_id >= self.num_terms: - break - - topics[:, word_id] = fields[1:] - - # normalise to probability distribution - self._topics = topics / topics.sum(axis=1, keepdims=True) - - def _get_topics(self): - """Get topics matrix, load from file if necessary.""" - if self._topics is None: - self._load_vw_topics() - return self._topics - - def _predict(self, chunk): - """Run given chunk of documents against currently trained model. - - Parameters - ---------- - chunk : iterable of list of (int, int) - Sequence of documents in BoW format. - - Returns - ------- - predictions : ndarray - Tuple of prediction matrix. - vw_data : dict - Vowpal Wabbit data. - - """ - corpus_size = write_corpus_as_vw(chunk, self._corpus_filename) - - cmd = self._get_vw_predict_command(corpus_size) - vw_data = _parse_vw_output(_run_vw_command(cmd)) - vw_data['corpus_size'] = corpus_size - - predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32) - - with utils.open(self._predict_filename, 'rb') as fhandle: - for i, line in enumerate(fhandle): - predictions[i, :] = line.split() - - predictions = predictions / predictions.sum(axis=1, keepdims=True) - - return predictions, vw_data - - def __getitem__(self, bow, eps=0.01): - """Convert document or corpus in BoW format to LDA vectors in BoW format - - Parameters - ---------- - bow : {list of (int, int), iterable of list of (int, int)} - Document or corpus in BoW format. - eps : float - Threshold value (all topics with probability < `eps` will be ignored. - - Returns - ------- - list of (int, float) - LDA vector for document **OR** - list of list of (int, float) - LDA vectors for corpus. 
- - """ - is_corpus, dummy_corpus = utils.is_corpus(bow) - if not is_corpus: - bow = [bow] - - predictions = self._predict(bow)[0] - - topics = [] - for row in predictions: - row_topics = [] - for topic_id, val in enumerate(row): - if val > eps: - row_topics.append((topic_id, val)) - topics.append(row_topics) - - return topics if is_corpus else topics[0] - - def _get_filename(self, name): - """Get path to given filename in temp directory. - - Parameters - ---------- - name : str - Name of the file. - - Returns - ------- - str - Path to a file. - - """ - return os.path.join(self.tmp_dir, name) - - @property - def _model_filename(self): - """Get path to file to write Vowpal Wabbit model to. - - Returns - ------- - str - Path to file to write Vowpal Wabbit model to. - - """ - return self._get_filename('model.vw') - - @property - def _cache_filename(self): - """Get path to file to write Vowpal Wabbit cache to. - - Returns - ------- - str - Path to file to write Vowpal Wabbit cache to. - - """ - return self._get_filename('cache.vw') - - @property - def _corpus_filename(self): - """Get path to file to write Vowpal Wabbit corpus to. - - Returns - ------- - str - Path to file to write Vowpal Wabbit corpus to. - - """ - return self._get_filename('corpus.vw') - - @property - def _topics_filename(self): - """Get path to file to write Vowpal Wabbit topics to. - - Returns - ------- - str - Path to file to write Vowpal Wabbit topics to. - - """ - return self._get_filename('topics.vw') - - @property - def _predict_filename(self): - """Get path to file to write Vowpal Wabbit predictions to. - - Returns - ------- - str - Path to file to write Vowpal Wabbit predictions to. - - """ - return self._get_filename('predict.vw') - - def __str__(self): - """Get text representation of model.""" - fields = ['num_terms', 'num_topics', 'chunksize', 'alpha', 'eta'] - kv = ["{0}={1}".format(field, getattr(self, field)) for field in fields] - return "{0}({1})".format(self.__class__.__name__, ', '.join(kv)) - - -def corpus_to_vw(corpus): - """Convert corpus to Vowpal Wabbit format. - - Parameters - ---------- - corpus : iterable of list of (int, int) - Collection of texts in BoW format. - - - Notes - ----- - - Vowpal Wabbit format :: - - | 4:7 14:1 22:8 6:3 - | 14:22 22:4 0:1 1:3 - | 7:2 8:2 - - - Yields - ------ - str - Corpus in Vowpal Wabbit, line by line. - - """ - for entries in corpus: - line = ['|'] - for word_id, count in entries: - line.append("{0}:{1}".format(word_id, count)) - yield ' '.join(line) - - -def write_corpus_as_vw(corpus, filename): - """Covert `corpus` to Vowpal Wabbit format and save it to `filename`. - - Parameters - ---------- - corpus : iterable of list of (int, int) - Collection of texts in BoW format. - filename : str - Path to output file. - - Returns - ------- - int - Number of lines in `filename`. - - """ - logger.debug("Writing corpus to: %s", filename) - - corpus_size = 0 - with utils.open(filename, 'wb') as corpus_file: - for line in corpus_to_vw(corpus): - corpus_file.write(line.encode('utf-8') + b'\n') - corpus_size += 1 - - return corpus_size - - -def _parse_vw_output(text): - """Get dict of useful fields from Vowpal Wabbit's output. - - Parameters - ---------- - text : str - Text from vw file. - - Returns - ------- - dict of (str, float) - Dictionary with field "average_loss", lower bound on mean per-word log-perplexity. 
- - """ - data = {} - for line in text.splitlines(): - if line.startswith('average loss'): - data['average_loss'] = float(line.split('=')[1]) - break - - return data - - -def _run_vw_command(cmd): - """Execute given Vowpal Wabbit command, log stdout and stderr. - - Parameters - ---------- - cmd : str - Given Vowpal Wabbit command to execute. - - Returns - ------- - str - Stdout and stderr. - - Raises - ------ - subprocess.CalledProcessError - If something goes wrong. - - """ - logger.info("Running Vowpal Wabbit command: %s", ' '.join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - output = proc.communicate()[0].decode('utf-8') - logger.debug("Vowpal Wabbit output: %s", output) - - if proc.returncode != 0: - raise subprocess.CalledProcessError(proc.returncode, ' '.join(cmd), output=output) - - return output - - -def vwmodel2ldamodel(vw_model, iterations=50): - """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to - :class:`~gensim.models.ldamodel.LdaModel`. - - This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel - into the gensim model. - - Parameters - ---------- - vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` - Trained Vowpal Wabbit model. - iterations : int - Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`. - - Returns - ------- - :class:`~gensim.models.ldamodel.LdaModel`. - Gensim native LDA. - - """ - model_gensim = LdaModel( - num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, - passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, - offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold, - dtype=numpy.float32 - ) - model_gensim.expElogbeta[:] = vw_model._get_topics() - return model_gensim diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py deleted file mode 100644 index cf76dbe13e..0000000000 --- a/gensim/models/wrappers/varembed.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright (C) 2017 Anmol Gulati -# Copyright (C) 2017 Radim Rehurek - -"""Python wrapper around `Varembed model `_. -Original paper:`"Morphological Priors for Probabilistic Neural Word Embeddings" `_. - -Notes ------ -* This module allows ability to obtain word vectors for out-of-vocabulary words, for the Varembed model. -* The wrapped model can not be updated with new documents for online training. - -""" - -import logging -import numpy as np - -from gensim import utils -from gensim.models.keyedvectors import KeyedVectors - -logger = logging.getLogger(__name__) - - -class VarEmbed(KeyedVectors): - """Python wrapper using `Varembed `_. - - Warnings - -------- - This is **only** python wrapper for `Varembed `_, - this allows to load pre-trained models only. - - """ - def __init__(self): - super(VarEmbed, self).__init__(vector_size=0) - self.vocab_size = 0 - - @classmethod - def load_varembed_format(cls, vectors, morfessor_model=None): - """Load the word vectors into matrix from the varembed output vector files. - - Parameters - ---------- - vectors : dict - Pickle file containing the word vectors. - morfessor_model : str, optional - Path to the trained morfessor model. - - Returns - ------- - :class:`~gensim.models.wrappers.varembed.VarEmbed` - Ready to use instance. 
- - """ - result = cls() - if vectors is None: - raise Exception("Please provide vectors binary to load varembed model") - d = utils.unpickle(vectors) - word_to_ix = d['word_to_ix'] - morpho_to_ix = d['morpho_to_ix'] - word_embeddings = d['word_embeddings'] - morpho_embeddings = d['morpheme_embeddings'] - result.load_word_embeddings(word_embeddings, word_to_ix) - if morfessor_model: - try: - import morfessor - morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model) - result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix) - except ImportError: - # Morfessor Package not found. - logger.error('Could not import morfessor. Not using morpheme embeddings') - raise ImportError('Could not import morfessor.') - - logger.info('Loaded varembed model vectors from %s', vectors) - return result - - def load_word_embeddings(self, word_embeddings, word_to_ix): - """Loads the word embeddings. - - Parameters - ---------- - word_embeddings : numpy.ndarray - Matrix with word-embeddings. - word_to_ix : dict of (str, int) - Mapping word to index. - - """ - logger.info("Loading the vocabulary") - self.key_to_index = {} - self.index_to_key = [] - counts = {} - for word in word_to_ix: - counts[word] = counts.get(word, 0) + 1 - self.vocab_size = len(counts) - self.vector_size = word_embeddings.shape[1] - self.vectors = np.zeros((self.vocab_size, self.vector_size)) - self.index_to_key = [None] * self.vocab_size - logger.info("Corpus has %i words", len(self)) - for word_id, word in enumerate(counts): - self.index_to_key[word_id] = word - self.key_to_index[word] = word_id - self.set_vecattr(word, 'count', counts[word]) - self.vectors[word_id] = word_embeddings[word_to_ix[word]] - assert((len(self.key_to_index), self.vector_size) == self.vectors.shape) - logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size) - - def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix): - """Include morpheme embeddings into vectors. - - Parameters - ---------- - morfessor_model : :class:`morfessor.baseline.BaselineModel` - Morfessor model. - morpho_embeddings : dict - Pickle file containing morpheme embeddings. - morpho_to_ix : dict - Mapping morpheme to index. - - """ - for word in self.key_to_index: - morpheme_embedding = np.array( - [ - morpho_embeddings[morpho_to_ix.get(m, -1)] - for m in morfessor_model.viterbi_segment(word)[0] - ] - ).sum(axis=0) - self.vectors[self.get_index(word)] += morpheme_embedding - logger.info("Added morphemes to word vectors") diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py deleted file mode 100644 index 6de3c256ad..0000000000 --- a/gensim/models/wrappers/wordrank.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright (C) 2017 Parul Sethi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Python wrapper around `Wordrank `_. -Original paper: `"WordRank: Learning Word Embeddings via Robust Ranking " `_. 
- -Installation ------------- -Use `official guide `_ or this one - -* On Linux :: - - sudo yum install boost-devel #(on RedHat/Centos) - sudo apt-get install libboost-all-dev #(on Ubuntu) - - git clone https://bitbucket.org/shihaoji/wordrank - cd wordrank/ - # replace icc to gcc in install.sh - ./install.sh - -* On MacOS :: - - brew install cmake - brew install wget - brew install boost - brew install mercurial - - git clone https://bitbucket.org/shihaoji/wordrank - cd wordrank/ - # replace icc to gcc in install.sh - ./install.sh - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.models.wrappers import Wordrank - >>> - >>> path_to_wordrank_binary = '/path/to/wordrank/binary' - >>> model = Wordrank.train(path_to_wordrank_binary, corpus_file='text8', out_name='wr_model') - >>> - >>> print(model["hello"]) # prints vector for given words -""" - -from __future__ import division - -import logging -import os -import copy -import multiprocessing -from shutil import copyfile, rmtree - -from gensim import utils -from gensim.models.keyedvectors import KeyedVectors -from gensim.scripts.glove2word2vec import glove2word2vec - - -logger = logging.getLogger(__name__) - - -class Wordrank(KeyedVectors): - """Python wrapper using `Wordrank implementation `_ - - Communication between Wordrank and Python takes place by working with data - files on disk and calling the Wordrank binary and glove's helper binaries - (for preparing training data) with subprocess module. - - Warnings - -------- - This is **only** python wrapper for `Wordrank implementation `_, - you need to install original implementation first and pass the path to wordrank dir to ``wr_path``. - - """ - @classmethod - def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, - sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, - beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0): - """Train model. - - Parameters - ---------- - wr_path : str - Absolute path to the Wordrank directory. - corpus_file : str - Path to corpus file, expected space-separated tokens in a each line format. - out_name : str - Name of the directory which will be created (in wordrank folder) to save embeddings and training data: - * ``model_word_current_.txt`` - Word Embeddings saved after every dump_period. - * ``model_context_current_.txt`` - Context Embeddings saved after every dump_period. - * ``meta/vocab.txt`` - vocab file. - * ``meta/wiki.toy`` - word-word concurrence values. - size : int, optional - Dimensionality of the feature vectors. - window : int, optional - Number of context words to the left (and to the right, if `symmetric = 1`). - symmetric : {0, 1}, optional - If 1 - using symmetric windows, if 0 - will use only left context words. - min_count : int, optional - Ignore all words with total frequency lower than `min_count`. - max_vocab_size : int, optional - Upper bound on vocabulary size, i.e. keep the most frequent words. If 0 - no limit. - sgd_num : int, optional - Number of SGD taken for each data point. - lrate : float, optional - Learning rate (attention: too high diverges, give Nan). - period : int, optional - Period of xi variable updates. - iter : int, optional - Number of iterations (epochs) over the corpus. - epsilon : float, optional - Power scaling value for weighting function. - dump_period : int, optional - Period after which embeddings should be dumped. 
- reg : int, optional - Value of regularization parameter. - alpha : int, optional - Alpha parameter of gamma distribution. - beta : int, optional - Beta parameter of gamma distribution. - loss : {"logistic", "hinge"}, optional - Name of the loss function. - memory : float, optional - Soft limit for memory consumption, in GB. - np : int, optional - Number of process to execute (mpirun option). - cleanup_files : bool, optional - If True, delete directory and files used by this wrapper. - sorted_vocab : {0, 1}, optional - If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing. - ensemble : {0, 1}, optional - If 1 - use ensemble of word and context vectors. - - """ - - # prepare training data (cooccurrence matrix and vocab) - model_dir = os.path.join(wr_path, out_name) - meta_dir = os.path.join(model_dir, 'meta') - os.makedirs(meta_dir) - logger.info("Dumped data will be stored in '%s'", model_dir) - copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1])) - - vocab_file = os.path.join(meta_dir, 'vocab.txt') - temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt') - cooccurrence_file = os.path.join(meta_dir, 'cooccurrence') - cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy') - meta_file = os.path.join(meta_dir, 'meta') - - cmd_vocab_count = [ - os.path.join(wr_path, 'glove', 'vocab_count'), - '-min-count', str(min_count), '-max-vocab', str(max_vocab_size) - ] - cmd_cooccurence_count = [ - os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), - '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric) - ] - cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)] - cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file] - - commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences] - input_fnames = [ - os.path.join(meta_dir, os.path.split(corpus_file)[-1]), - os.path.join(meta_dir, os.path.split(corpus_file)[-1]), - cooccurrence_file - ] - output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file] - - logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames)) - for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames): - with utils.open(input_fname, 'rb') as r: - with utils.open(output_fname, 'wb') as w: - utils.check_output(w, args=command, stdin=r) - - logger.info("Deleting frequencies from vocab file") - with utils.open(vocab_file, 'wb') as w: - utils.check_output(w, args=cmd_del_vocab_freq) - - with utils.open(vocab_file, 'rb') as f: - numwords = sum(1 for _ in f) - with utils.open(cooccurrence_shuf_file, 'rb') as f: - numlines = sum(1 for _ in f) - with utils.open(meta_file, 'wb') as f: - meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format( - numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], - numwords, vocab_file.split('/')[-1] - ) - f.write(meta_info.encode('utf-8')) - - if iter % dump_period == 0: - iter += 1 - else: - logger.warning( - "Resultant embedding will be from %d iterations rather than the input %d iterations, " - "as wordrank dumps the embedding only at dump_period intervals. 
" - "Input an appropriate combination of parameters (iter, dump_period) " - "such that \"iter mod dump_period\" is zero.", - iter - (iter % dump_period), iter - ) - - wr_args = { - 'path': meta_dir, - 'nthread': multiprocessing.cpu_count(), - 'sgd_num': sgd_num, - 'lrate': lrate, - 'period': period, - 'iter': iter, - 'epsilon': epsilon, - 'dump_prefix': 'model', - 'dump_period': dump_period, - 'dim': size, - 'reg': reg, - 'alpha': alpha, - 'beta': beta, - 'loss': loss - } - - # run wordrank executable with wr_args - cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')] - for option, value in wr_args.items(): - cmd.append('--%s' % option) - cmd.append(str(value)) - logger.info("Running wordrank binary") - utils.check_output(args=cmd) - - # use embeddings from max. iteration's dump - max_iter_dump = iter - (iter % dump_period) - os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words')) - os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts')) - model = cls.load_wordrank_model( - os.path.join(model_dir, 'wordrank.words'), vocab_file, - os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble - ) - - if cleanup_files: - rmtree(model_dir) - return model - - @classmethod - def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1): - """Load model from `model_file`. - - Parameters - ---------- - model_file : str - Path to model in GloVe format. - vocab_file : str, optional - Path to file with vocabulary. - context_file : str, optional - Path to file with context-embedding in word2vec_format. - sorted_vocab : {0, 1}, optional - If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing. - ensemble : {0, 1}, optional - If 1 - use ensemble of word and context vectors. - - """ - model = cls.load_word2vec_format(model_file, binary=False, no_header=True) - if ensemble and context_file: - model.ensemble_embedding(model_file, context_file) - if sorted_vocab and vocab_file: - model.sort_embeddings(vocab_file) - return model - - def sort_embeddings(self, vocab_file): - """Sort embeddings according to word frequency. - - Parameters - ---------- - vocab_file : str - Path to file with vocabulary. - - """ - counts = {} - vocab_size = len(self.vocab) - prev_syn0 = copy.deepcopy(self.syn0) - prev_vocab = copy.deepcopy(self.vocab) - self.index2word = [] - - # sort embeddings using frequency sorted vocab file in wordrank - with utils.open(vocab_file, 'rb') as fin: - for index, line in enumerate(fin): - word, count = utils.to_unicode(line).strip(), vocab_size - index - # store word with it's count in a dict - counts[word] = int(count) - # build new index2word with frequency sorted words - self.index2word.append(word) - assert len(self.index2word) == vocab_size, 'mismatch between vocab sizes' - - for word_id, word in enumerate(self.index2word): - self.syn0[word_id] = prev_syn0[prev_vocab[word].index] - self.vocab[word].index = word_id - self.vocab[word].count = counts[word] - - def ensemble_embedding(self, word_embedding, context_embedding): - """Replace current syn0 with the sum of context and word embeddings. - - Parameters - ---------- - word_embedding : str - Path to word embeddings in GloVe format. - context_embedding : str - Path to context embeddings in word2vec_format. - - Returns - ------- - numpy.ndarray - Matrix with new embeddings. 
- - """ - glove2word2vec(context_embedding, context_embedding + '.w2vformat') - w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding) - c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding) - # compare vocab words using keys of dict vocab - assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings' - - # sort context embedding to have words in same order as word embedding - prev_c_emb = copy.deepcopy(c_emb.syn0) - for word_id, word in enumerate(w_emb.index2word): - c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index] - # add vectors of the two embeddings - new_emb = w_emb.syn0 + c_emb.syn0 - self.syn0 = new_emb - return new_emb diff --git a/gensim/sklearn_api/__init__.py b/gensim/sklearn_api/__init__.py deleted file mode 100644 index 2f21fc3864..0000000000 --- a/gensim/sklearn_api/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Scikit-learn wrappers for gensim. - -Contains various gensim based implementations which match with scikit-learn standards. -See `sklearn dev-guide `_ for complete set of conventions. - -""" -from .ldamodel import LdaTransformer # noqa: F401 -from .lsimodel import LsiTransformer # noqa: F401 -from .rpmodel import RpTransformer # noqa: F401 -from .ldaseqmodel import LdaSeqTransformer # noqa: F401 -from .w2vmodel import W2VTransformer # noqa: F401 -from .atmodel import AuthorTopicTransformer # noqa: F401 -from .d2vmodel import D2VTransformer # noqa: F401 -from .text2bow import Text2BowTransformer # noqa: F401 -from .tfidf import TfIdfTransformer # noqa: F401 -from .hdp import HdpTransformer # noqa: F401 -from .ftmodel import FTTransformer # noqa: F401 -from .phrases import PhrasesTransformer # noqa: F401 diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py deleted file mode 100644 index 1b07537c27..0000000000 --- a/gensim/sklearn_api/atmodel.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.atmodel.AuthorTopicModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_dictionary, common_corpus - >>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer - >>> - >>> # Pass a mapping from authors to the documents they contributed to. - >>> author2doc = { - ... 'john': [0, 1, 2, 3, 4, 5, 6], - ... 'jane': [2, 3, 4, 5, 6, 7, 8], - ... 'jack': [0, 2, 4, 6, 8] - ... } - >>> - >>> # Lets use the model to discover 2 different topics. - >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100) - >>> - >>> # In which of those 2 topics does jack mostly contribute to? - >>> topic_dist = model.fit(common_corpus).transform('jack') - -""" -import numpy as np -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim import matutils - - -class AuthorTopicTransformer(TransformerMixin, BaseEstimator): - """Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. 
- - The model's internal workings are heavily based on `"The Author-Topic Model for Authors and Documents", - Osen-Zvi et. al 2004 `_. - - """ - def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None, - chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, - alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, - gamma_threshold=0.001, serialized=False, serialization_path=None, - minimum_probability=0.01, random_state=None): - """ - - Parameters - ---------- - num_topics : int, optional - Number of requested latent topics to be extracted from the training corpus. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping from a words' ID to the word itself. Used to determine the vocabulary size, as well as for debugging - and topic printing. - author2doc : dict of (str, list of int), optional - Maps an authors name to a list of document IDs where has has contributed. - Either `author2doc` or `doc2author` **must be supplied**. - doc2author : dict of (int, list of str) - Maps a document (using its ID) to a list of author names that contributed to it. - Either `author2doc` or `doc2author` **must be supplied**. - chunksize : int, optional - Number of documents to be processed by the model in each mini-batch. - passes : int, optional - Number of times the model can make a pass over the corpus during training. - iterations : int, optional - Maximum number of times the model before convergence during the M step of the EM algorithm. - decay : float, optional - A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten - when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors - and Documents", Osen-Zvi et. al 2004 `_. - offset : float, optional - Hyper-parameter that controls how much we will slow down the first steps the first few iterations. - Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Osen-Zvi et. al 2004 - `_. - alpha : {np.ndarray, str}, optional - Can be set to an 1D array of length equal to the number of expected topics that expresses - our a-priori belief for the each topics' probability. - Alternatively default prior selecting strategies can be employed by supplying a string: - - * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`. - * 'auto': Learns an asymmetric prior from the corpus. - eta : {float, np.array, str}, optional - A-priori belief on word probability, this can be: - - * scalar for a symmetric prior over topic/word probability, - * vector of length num_words to denote an asymmetric user defined probability for each word, - * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, - * the string 'auto' to learn the asymmetric prior from the data. - update_every : int, optional - Number of mini-batches between each model update. - eval_every : int, optional - Number of updates between two log perplexity estimates. - Set to None to disable perplexity estimation. - gamma_threshold : float, optional - Minimum change in the value of the gamma parameters to continue iterating. - serialized : bool, optional - Indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`) - or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from - other Gensim models. If your data is too large to fit in to memory, use this functionality. 
- serialization_path : str, optional - Path to file that used for storing the serialized object, **must be supplied if `serialized = True`**. - An existing file *cannot* be overwritten, either delete the old file or choose a different name. - minimum_probability : float, optional - Topics with a probability lower than this threshold will be filtered out. - random_state : {np.random.RandomState, int}, optional - Either a randomState object or a seed to generate one. Useful for reproducibility. - - """ - self.gensim_model = None - self.num_topics = num_topics - self.id2word = id2word - self.author2doc = author2doc - self.doc2author = doc2author - self.chunksize = chunksize - self.passes = passes - self.iterations = iterations - self.decay = decay - self.offset = offset - self.alpha = alpha - self.eta = eta - self.update_every = update_every - self.eval_every = eval_every - self.gamma_threshold = gamma_threshold - self.serialized = serialized - self.serialization_path = serialization_path - self.minimum_probability = minimum_probability - self.random_state = random_state - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : iterable of list of (int, number) - Sequence of documents in BoW format. - - Returns - ------- - :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer` - The trained model. - - """ - self.gensim_model = models.AuthorTopicModel( - corpus=X, num_topics=self.num_topics, id2word=self.id2word, - author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes, - iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta, - update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, - serialized=self.serialized, serialization_path=self.serialization_path, - minimum_probability=self.minimum_probability, random_state=self.random_state - ) - return self - - def transform(self, author_names): - """Infer the topic probabilities for each author. - - Parameters - ---------- - author_names : {iterable of str, str} - Author name or sequence of author names whose topics will be identified. - - Returns - ------- - numpy.ndarray - Topic distribution for each input author. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of arrays - if not isinstance(author_names, list): - author_names = [author_names] - # returning dense representation for compatibility with sklearn - # but we should go back to sparse representation in the future - topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names] - return np.reshape(np.array(topics), (len(author_names), self.num_topics)) - - def partial_fit(self, X, author2doc=None, doc2author=None): - """Train model over a potentially incomplete set of documents. - - This method can be used in two ways: - * On an unfitted model in which case the model is initialized and trained on `X`. - * On an already fitted model in which case the model is **updated** by `X`. - - - Parameters - ---------- - X : iterable of list of (int, number) - Sequence of documents in BoW format. - author2doc : dict of (str, list of int), optional - Maps an authors name to a list of document IDs where has has contributed. - Either `author2doc` or `doc2author` **must be supplied**. 
- doc2author : dict of (int, list of str) - Maps a document (using its ID) to a list of author names that contributed to it. - Either `author2doc` or `doc2author` **must be supplied**. - - Returns - ------- - :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer` - The trained model. - - """ - if self.gensim_model is None: - self.gensim_model = models.AuthorTopicModel( - corpus=X, num_topics=self.num_topics, id2word=self.id2word, - author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes, - iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta, - update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, - serialized=self.serialized, serialization_path=self.serialization_path, - minimum_probability=self.minimum_probability, random_state=self.random_state - ) - - self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author) - return self diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py deleted file mode 100644 index 660d101131..0000000000 --- a/gensim/sklearn_api/d2vmodel.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2011 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.doc2vec.Doc2Vec`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_texts - >>> from gensim.sklearn_api import D2VTransformer - >>> - >>> model = D2VTransformer(min_count=1, vector_size=5) - >>> docvecs = model.fit_transform(common_texts) # represent `common_texts` as vectors - -""" - -import numpy as np - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim.models import doc2vec - - -class D2VTransformer(TransformerMixin, BaseEstimator): - """Base Doc2Vec module, wraps :class:`~gensim.models.doc2vec.Doc2Vec`. - - This model based on `Quoc Le, Tomas Mikolov: "Distributed Representations of Sentences and Documents" - `_. - - """ - def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, dv=None, - dv_mapfile=None, comment=None, trim_rule=None, vector_size=100, alpha=0.025, window=5, - min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - hs=0, negative=5, cbow_mean=1, - hashfxn=hash, epochs=5, sorted_vocab=1, batch_words=10000): - """ - - Parameters - ---------- - - dm_mean : int {1,0}, optional - If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when `dm_concat=0`. - dm : int {1,0}, optional - Defines the training algorithm. If `dm=1` - distributed memory (PV-DM) is used. - Otherwise, distributed bag of words (PV-DBOW) is employed. - dbow_words : int {1,0}, optional - If set to 1 - trains word-vectors (in skip-gram fashion) simultaneous with DBOW - doc-vector training, If 0, only trains doc-vectors (faster). - dm_concat : int {1,0}, optional - If 1, use concatenation of context vectors rather than sum/average. - Note concatenation results in a much-larger model, as the input is no longer the size of one - (sampled or arithmetically combined) word vector, but the size of the tag(s) and all words - in the context strung together. 
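For reference, the `AuthorTopicTransformer` whose removal this hunk records was driven as in its module docstring above. The following is a minimal sketch of that usage, not new functionality: it only runs against a gensim checkout that still ships `gensim.sklearn_api` (i.e. the tree this patch applies to), and the author-to-document mapping is the one from the removed docstring.

    >>> from gensim.test.utils import common_dictionary, common_corpus
    >>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer
    >>>
    >>> author2doc = {
    ...     'john': [0, 1, 2, 3, 4, 5, 6],
    ...     'jane': [2, 3, 4, 5, 6, 7, 8],
    ...     'jack': [0, 2, 4, 6, 8],
    ... }
    >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100)
    >>> topic_dist = model.fit(common_corpus).transform('jack')
    >>> assert topic_dist.shape == (1, 2)  # one row per queried author, one column per topic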
- dm_tag_count : int, optional - Expected constant number of document tags per document, when using dm_concat mode. - dv : :class:`~gensim.models.keyedvectors.KeyedVectors` - A mapping from a string or int tag to its vector representation. - dv_mapfile : str, optional - Path to a file containing the docvecs mapping. If `dv` is None, this file will be used to create it. - comment : str, optional - A model descriptive comment, used for logging and debugging purposes. - trim_rule : function ((str, int, int) -> int), optional - Vocabulary trimming rule that accepts (word, count, min_count). - Specifies whether certain words should remain in the vocabulary (:attr:`gensim.utils.RULE_KEEP`), - be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default - (:attr:`gensim.utils.RULE_DEFAULT`). - If None, then :func:`gensim.utils.keep_vocab_item` will be used. - vector_size : int, optional - Dimensionality of the feature vectors. - alpha : float, optional - The initial learning rate. - window : int, optional - The maximum distance between the current and predicted word within a sentence. - min_count : int, optional - Ignores all words with total frequency lower than this. - max_vocab_size : int, optional - Limits the RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. - Set to `None` for no limit. - sample : float, optional - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - seed : int, optional - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. - Note that for a **fully deterministically-reproducible run**, you **must also limit the model to - a single worker thread (`workers=1`)**, to eliminate ordering jitter from OS thread scheduling. - In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` - environment variable to control hash randomization. - workers : int, optional - Use this many worker threads to train the model. Will yield a speedup when training with multicore machines. - min_alpha : float, optional - Learning rate will linearly drop to `min_alpha` as training progresses. - hs : int {1,0}, optional - If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, - negative sampling will be used. - negative : int, optional - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). If set to 0, no negative sampling is used. - cbow_mean : int, optional - Same as `dm_mean`, **unused**. - hashfxn : function (object -> int), optional - A hashing function. Used to create an initial random reproducible vector by hashing the random seed. - epochs : int, optional - Number of epochs to iterate through the corpus. - sorted_vocab : bool, optional - Whether the vocabulary should be sorted internally. - batch_words : int, optional - Number of words to be handled by each job. 
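The removed `D2VTransformer` (its `fit`/`transform` implementation follows below in this hunk) was exercised as in its module docstring. A condensed sketch, again assuming a pre-removal gensim checkout:

    >>> from gensim.test.utils import common_texts
    >>> from gensim.sklearn_api import D2VTransformer
    >>>
    >>> model = D2VTransformer(min_count=1, vector_size=5)
    >>> docvecs = model.fit_transform(common_texts)  # one 5-dimensional vector per input document
    >>> assert docvecs.shape == (len(common_texts), 5)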
- - """ - self.gensim_model = None - self.dm_mean = dm_mean - self.dm = dm - self.dbow_words = dbow_words - self.dm_concat = dm_concat - self.dm_tag_count = dm_tag_count - self.dv = dv - self.dv_mapfile = dv_mapfile - self.comment = comment - self.trim_rule = trim_rule - - # attributes associated with gensim.models.Word2Vec - self.vector_size = vector_size - self.alpha = alpha - self.window = window - self.min_count = min_count - self.max_vocab_size = max_vocab_size - self.sample = sample - self.seed = seed - self.workers = workers - self.min_alpha = min_alpha - self.hs = hs - self.negative = negative - self.cbow_mean = int(cbow_mean) - self.hashfxn = hashfxn - self.epochs = epochs - self.sorted_vocab = sorted_vocab - self.batch_words = batch_words - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str} - A collection of tagged documents used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.d2vmodel.D2VTransformer` - The trained model. - - """ - if isinstance([i for i in X[:1]][0], doc2vec.TaggedDocument): - d2v_sentences = X - else: - d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(X)] - self.gensim_model = models.Doc2Vec( - documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, - dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, - dv=self.dv, dv_mapfile=self.dv_mapfile, comment=self.comment, - trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, - min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, - seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, - negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words - ) - return self - - def transform(self, docs): - """Infer the vector representations for the input documents. - - Parameters - ---------- - docs : {iterable of list of str, list of str} - Input document or sequence of documents. - - Returns - ------- - numpy.ndarray of shape [`len(docs)`, `size`] - The vector representation of the `docs`. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(docs[0], str): - docs = [docs] - vectors = [self.gensim_model.infer_vector(doc) for doc in docs] - return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size)) diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py deleted file mode 100644 index c42f95274e..0000000000 --- a/gensim/sklearn_api/ftmodel.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Authors: M.Cemil Guney -# Copyright (C) 2018 RaRe Technologies s.r.o. -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit-learn interface for :class:`~gensim.models.fasttext.FastText`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_texts - >>> from gensim.sklearn_api import FTTransformer - >>> - >>> # Create a model to represent each word by a 10 dimensional vector. 
- >>> model = FTTransformer(vector_size=10, min_count=1, seed=1) - >>> - >>> # What is the vector representations of the word 'graph' and 'system'? - >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) - >>> assert wordvecs.shape == (2, 10) - -Retrieve word-vector for vocab and out-of-vocab word: - -.. sourcecode:: pycon - - >>> existent_word = "system" - >>> existent_word in model.gensim_model.wv.vocab - True - >>> existent_word_vec = model.transform(existent_word) # numpy vector of a word - >>> assert existent_word_vec.shape == (1, 10) - >>> - >>> oov_word = "sys" - >>> oov_word in model.gensim_model.wv.vocab - False - >>> oov_word_vec = model.transform(oov_word) # numpy vector of a word - >>> assert oov_word_vec.shape == (1, 10) - -""" -import numpy as np - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models - - -class FTTransformer(TransformerMixin, BaseEstimator): - """Base FastText module, wraps :class:`~gensim.models.fasttext.FastText`. - - For more information please have a look to `Enriching Word Vectors with Subword - Information `_. - - """ - def __init__(self, sg=0, hs=0, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, - cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, - max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, - batch_words=10000): - """ - - Parameters - ---------- - sg : {1, 0}, optional - Training algorithm: skip-gram if `sg=1`, otherwise CBOW. - hs : {1,0}, optional - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. - vector_size : int, optional - Dimensionality of the word vectors. - alpha : float, optional - The initial learning rate. - window : int, optional - The maximum distance between the current and predicted word within a sentence. - min_count : int, optional - The model ignores all words with total frequency lower than this. - max_vocab_size : int, optional - Limits the RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. - Set to `None` for no limit. - word_ngrams : {1,0}, optional - If 1, uses enriches word vectors with subword(n-grams) information. - If 0, this is equivalent to :class:`~gensim.models.word2vec.Word2Vec`. - sample : float, optional - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - seed : int, optional - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, - you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). - workers : int, optional - Use these many worker threads to train the model (=faster training with multicore machines). - min_alpha : float, optional - Learning rate will linearly drop to `min_alpha` as training progresses. 
- negative : int, optional - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - ns_exponent : float, optional - The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion - to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more - than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. - More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that - other values may perform better for recommendation applications. - cbow_mean : {1,0}, optional - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : function, optional - Hash function to use to randomly initialize weights, for increased training reproducibility. - epochs : int, optional - Number of iterations (epochs) over the corpus. - min_n : int, optional - Minimum length of char n-grams to be used for training word representations. - max_n : int, optional - Max length of char ngrams to be used for training word representations. Set `max_n` to be - lesser than `min_n` to avoid char ngrams being used. - sorted_vocab : {1,0}, optional - If 1, sort the vocabulary by descending frequency before assigning word indices. - bucket : int, optional - Character ngrams are hashed into a fixed number of buckets, in order to limit the - memory usage of the model. This option specifies the number of buckets used by the model. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during - :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of themodel. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - batch_words : int, optional - Target size (in words) for batches of examples passed to worker threads (and - thus cython routines).(Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - - """ - self.gensim_model = None - self.sg = sg - self.hs = hs - self.vector_size = vector_size - self.alpha = alpha - self.window = window - self.min_count = min_count - self.max_vocab_size = max_vocab_size - self.word_ngrams = word_ngrams - self.sample = sample - self.seed = seed - self.workers = workers - self.min_alpha = min_alpha - self.negative = negative - self.ns_exponent = ns_exponent - self.cbow_mean = cbow_mean - self.hashfxn = hashfxn - self.epochs = epochs - self.null_word = null_word - self.min_n = min_n - self.max_n = max_n - self.sorted_vocab = sorted_vocab - self.bucket = bucket - self.trim_rule = trim_rule - self.batch_words = batch_words - - def fit(self, X, y=None): - """Fit the model according to the given training data. 
- - Parameters - ---------- - X : iterable of iterables of str - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - - Returns - ------- - :class:`~gensim.sklearn_api.ftmodel.FTTransformer` - The trained model. - - """ - self.gensim_model = models.FastText( - sentences=X, sg=self.sg, hs=self.hs, vector_size=self.vector_size, - alpha=self.alpha, window=self.window, min_count=self.min_count, - max_vocab_size=self.max_vocab_size, word_ngrams=self.word_ngrams, - sample=self.sample, seed=self.seed, workers=self.workers, - min_alpha=self.min_alpha, negative=self.negative, - ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, - min_n=self.min_n, max_n=self.max_n, sorted_vocab=self.sorted_vocab, - bucket=self.bucket, trim_rule=self.trim_rule, - batch_words=self.batch_words - ) - return self - - def transform(self, words): - """Get the word vectors the input words. - - Parameters - ---------- - words : {iterable of str, str} - Word or a collection of words to be transformed. - - Returns - ------- - np.ndarray of shape [`len(words)`, `vector_size`] - A 2D array where each row is the vector of one word. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(words, str): - words = [words] - vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.vector_size)) diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py deleted file mode 100644 index e98c1916c8..0000000000 --- a/gensim/sklearn_api/hdp.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2011 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.hdpmodel.HdpModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_dictionary, common_corpus - >>> from gensim.sklearn_api import HdpTransformer - >>> - >>> # Lets extract the distribution of each document in topics - >>> model = HdpTransformer(id2word=common_dictionary) - >>> distr = model.fit_transform(common_corpus) - -""" -import numpy as np -from scipy import sparse -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim import matutils - - -class HdpTransformer(TransformerMixin, BaseEstimator): - """Base HDP module, wraps :class:`~gensim.models.hdpmodel.HdpModel`. - - The inner workings of this class heavily depends on `Wang, Paisley, Blei: "Online Variational - Inference for the Hierarchical Dirichlet Process, JMLR (2011)" - `_. 
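Condensing the two docstring examples of the deleted `FTTransformer` into one sketch (pre-removal gensim only): a fitted transformer returns vectors both for in-vocabulary words and, thanks to character n-grams, for out-of-vocabulary words.

    >>> from gensim.test.utils import common_texts
    >>> from gensim.sklearn_api import FTTransformer
    >>>
    >>> model = FTTransformer(vector_size=10, min_count=1, seed=1)
    >>> wordvecs = model.fit(common_texts).transform(['graph', 'system'])
    >>> assert wordvecs.shape == (2, 10)
    >>>
    >>> # Out-of-vocabulary words still get a vector, assembled from char n-grams.
    >>> assert model.transform('sys').shape == (1, 10)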
- - """ - def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, - alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): - """ - - Parameters - ---------- - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping between a words ID and the word itself in the vocabulary. - max_chunks : int, optional - Upper bound on how many chunks to process.It wraps around corpus beginning in another corpus pass, - if there are not enough chunks in the corpus. - max_time : int, optional - Upper bound on time in seconds for which model will be trained. - chunksize : int, optional - Number of documents to be processed by the model in each mini-batch. - kappa : float, optional - Learning rate, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet - Process, JMLR (2011)" `_. - tau : float, optional - Slow down parameter, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - K : int, optional - Second level truncation level, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - T : int, optional - Top level truncation level, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - alpha : int, optional - Second level concentration, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - gamma : int, optional - First level concentration, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - eta : float, optional - The topic Dirichlet, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical - Dirichlet Process, JMLR (2011)" `_. - scale : float, optional - Weights information from the mini-chunk of corpus to calculate rhot. - var_converge : float, optional - Lower bound on the right side of convergence. Used when updating variational parameters - for a single document. - outputdir : str, optional - Path to a directory where topic and options information will be stored. - random_state : int, optional - Seed used to create a :class:`~np.random.RandomState`. Useful for obtaining reproducible results. - - """ - self.gensim_model = None - self.id2word = id2word - self.max_chunks = max_chunks - self.max_time = max_time - self.chunksize = chunksize - self.kappa = kappa - self.tau = tau - self.K = K - self.T = T - self.alpha = alpha - self.gamma = gamma - self.eta = eta - self.scale = scale - self.var_converge = var_converge - self.outputdir = outputdir - self.random_state = random_state - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : {iterable of list of (int, number), scipy.sparse matrix} - A collection of documents in BOW format used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.hdp.HdpTransformer` - The trained model. 
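The corresponding usage of the `HdpTransformer` being deleted here, as given in its module docstring (a sketch that assumes a checkout from before this patch):

    >>> from gensim.test.utils import common_dictionary, common_corpus
    >>> from gensim.sklearn_api import HdpTransformer
    >>>
    >>> model = HdpTransformer(id2word=common_dictionary)
    >>> distr = model.fit_transform(common_corpus)  # one dense topic-distribution row per document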
- - """ - if sparse.issparse(X): - corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - else: - corpus = X - - self.gensim_model = models.HdpModel( - corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks, - max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, - K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, - var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state - ) - return self - - def transform(self, docs): - """Infer a matrix of topic distribution for the given document bow, where a_ij - indicates (topic_i, topic_probability_j). - - Parameters - ---------- - docs : {iterable of list of (int, number), list of (int, number)} - Document or sequence of documents in BOW format. - - Returns - ------- - numpy.ndarray of shape [`len(docs), num_topics`] - Topic distribution for `docs`. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(docs[0], tuple): - docs = [docs] - distribution, max_num_topics = [], 0 - - for doc in docs: - topicd = self.gensim_model[doc] - distribution.append(topicd) - max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1) - - # returning dense representation for compatibility with sklearn - # but we should go back to sparse representation in the future - distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution] - return np.reshape(np.array(distribution), (len(docs), max_num_topics)) - - def partial_fit(self, X): - """Train model over a potentially incomplete set of documents. - - Uses the parameters set in the constructor. - This method can be used in two ways: - * On an unfitted model in which case the model is initialized and trained on `X`. - * On an already fitted model in which case the model is **updated** by `X`. - - Parameters - ---------- - X : {iterable of list of (int, number), scipy.sparse matrix} - A collection of documents in BOW format used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.hdp.HdpTransformer` - The trained model. - - """ - if sparse.issparse(X): - X = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - - if self.gensim_model is None: - self.gensim_model = models.HdpModel( - id2word=self.id2word, max_chunks=self.max_chunks, - max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, - K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, - var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state - ) - - self.gensim_model.update(corpus=X) - return self diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py deleted file mode 100644 index ed12b95369..0000000000 --- a/gensim/sklearn_api/ldamodel.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.ldamodel.LdaModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. 
sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import LdaTransformer - >>> - >>> # Reduce each document to 2 dimensions (topics) using the sklearn interface. - >>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1) - >>> docvecs = model.fit_transform(common_corpus) - -""" -import numpy as np -from scipy import sparse -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim import matutils - - -class LdaTransformer(TransformerMixin, BaseEstimator): - """Base LDA module, wraps :class:`~gensim.models.ldamodel.LdaModel`. - - The inner workings of this class depends heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach: - "Online Learning for Latent Dirichlet Allocation NIPS'10" `_ and - `David M. Blei, Andrew Y. Ng, Michael I. Jordan: "Latent Dirichlet Allocation" - `_. - - """ - def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric', - eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, - minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32): - """ - - Parameters - ---------- - num_topics : int, optional - The number of requested latent topics to be extracted from the training corpus. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging. - chunksize : int, optional - Number of documents in batch. - passes : int, optional - Number of passes through the corpus during training. - update_every : int, optional - Number of documents to be iterated through for each update. - Set to 0 for batch learning, > 1 for online iterative learning. - alpha : {np.ndarray, str}, optional - Can be set to an 1D array of length equal to the number of expected topics that expresses - our a-priori belief for the each topics' probability. - Alternatively default prior selecting strategies can be employed by supplying a string: - - * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`. - * 'auto': Learns an asymmetric prior from the corpus. - eta : {float, np.array, str}, optional - A-priori belief on word probability, this can be: - - * scalar for a symmetric prior over topic/word probability, - * vector of length num_words to denote an asymmetric user defined probability for each word, - * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, - * the string 'auto' to learn the asymmetric prior from the data. - decay : float, optional - A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten - when each new document is examined. Corresponds to Kappa from - `Matthew D. Hoffman, David M. Blei, Francis Bach: - "Online Learning for Latent Dirichlet Allocation NIPS'10" `_. - offset : float, optional - Hyper-parameter that controls how much we will slow down the first steps the first few iterations. - Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach: - "Online Learning for Latent Dirichlet Allocation NIPS'10" `_. - eval_every : int, optional - Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x. 
- iterations : int, optional - Maximum number of iterations through the corpus when inferring the topic distribution of a corpus. - gamma_threshold : float, optional - Minimum change in the value of the gamma parameters to continue iterating. - minimum_probability : float, optional - Topics with a probability lower than this threshold will be filtered out. - random_state : {np.random.RandomState, int}, optional - Either a randomState object or a seed to generate one. Useful for reproducibility. - scorer : str, optional - Method to compute a score reflecting how well the model has fit the input corpus, allowed values are: - * 'perplexity': Perplexity of language model - * 'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence. - dtype : {numpy.float16, numpy.float32, numpy.float64}, optional - Data-type to use during calculations inside model. All inputs are also converted. - - Notes - ----- - Configure `passes` and `update_every` params to choose the mode among: - * online (single-pass): update_every != None and passes == 1 - * online (multi-pass): update_every != None and passes > 1 - * batch: update_every == None - - By default, 'online (single-pass)' mode is used for training the LDA model. - - """ - self.gensim_model = None - self.num_topics = num_topics - self.id2word = id2word - self.chunksize = chunksize - self.passes = passes - self.update_every = update_every - self.alpha = alpha - self.eta = eta - self.decay = decay - self.offset = offset - self.eval_every = eval_every - self.iterations = iterations - self.gamma_threshold = gamma_threshold - self.minimum_probability = minimum_probability - self.random_state = random_state - self.scorer = scorer - self.dtype = dtype - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : {iterable of iterable of (int, int), scipy.sparse matrix} - A collection of documents in BOW format used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.ldamodel.LdaTransformer` - The trained model. - - """ - if sparse.issparse(X): - corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - else: - corpus = X - - self.gensim_model = models.LdaModel( - corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, - chunksize=self.chunksize, passes=self.passes, update_every=self.update_every, - alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, - eval_every=self.eval_every, iterations=self.iterations, - gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability, - random_state=self.random_state, dtype=self.dtype - ) - return self - - def transform(self, docs): - """Infer the topic distribution for `docs`. - - Parameters - ---------- - docs : {iterable of list of (int, number), list of (int, number)} - Document or sequence of documents in BoW format. - - Returns - ------- - numpy.ndarray of shape [`len(docs)`, `num_topics`] - The topic distribution for each input document. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." 
- ) - - # The input as array of array - if isinstance(docs[0], tuple): - docs = [docs] - # returning dense representation for compatibility with sklearn - # but we should go back to sparse representation in the future - distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] - return np.reshape(np.array(distribution), (len(docs), self.num_topics)) - - def partial_fit(self, X): - """Train model over a potentially incomplete set of documents. - - Uses the parameters set in the constructor. - This method can be used in two ways: - * On an unfitted model in which case the model is initialized and trained on `X`. - * On an already fitted model in which case the model is **updated** by `X`. - - Parameters - ---------- - X : {iterable of iterable of (int, int), scipy.sparse matrix} - A collection of documents in BOW format used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.ldamodel.LdaTransformer` - The trained model. - - """ - if sparse.issparse(X): - X = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - - if self.gensim_model is None: - self.gensim_model = models.LdaModel( - num_topics=self.num_topics, id2word=self.id2word, - chunksize=self.chunksize, passes=self.passes, update_every=self.update_every, - alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, - eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold, - minimum_probability=self.minimum_probability, random_state=self.random_state, - dtype=self.dtype - ) - - self.gensim_model.update(corpus=X) - return self - - def score(self, X, y=None): - """Compute score reflecting how well the model has fitted for the input data. - - The scoring method is set using the `scorer` argument in :meth:`~gensim.sklearn_api.ldamodel.LdaTransformer`. - Higher score is better. - - Parameters - ---------- - X : iterable of list of (int, number) - Sequence of documents in BOW format. - - Returns - ------- - float - The score computed based on the selected method. - - """ - if self.scorer == 'perplexity': - corpus_words = sum(cnt for document in X for _, cnt in document) - subsample_ratio = 1.0 - perwordbound = \ - self.gensim_model.bound(X, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) - return -1 * np.exp2(-perwordbound) # returning (-1*perplexity) to select model with minimum value - elif self.scorer == 'u_mass': - goodcm = models.CoherenceModel(model=self.gensim_model, corpus=X, coherence=self.scorer, topn=3) - return goodcm.get_coherence() - else: - raise ValueError("Invalid value {} supplied for `scorer` param".format(self.scorer)) diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py deleted file mode 100644 index cb70242129..0000000000 --- a/gensim/sklearn_api/ldaseqmodel.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer - >>> - >>> # Create a sequential LDA transformer to extract 2 topics from the common corpus. 
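For the `LdaTransformer` removed above, the docstring example plus the `scorer` hook looked roughly like this (sketch only, pre-removal gensim; `scorer='perplexity'` is the constructor default and a higher score is better):

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.sklearn_api import LdaTransformer
    >>>
    >>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1)
    >>> docvecs = model.fit_transform(common_corpus)
    >>> assert docvecs.shape == (len(common_corpus), 2)
    >>>
    >>> # score() applies whichever scorer was chosen in the constructor.
    >>> perplexity_score = model.score(common_corpus)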
- >>> # Divide the work into 3 unequal time slices. - >>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim') - >>> - >>> # Each document almost entirely belongs to one of the two topics. - >>> transformed_corpus = model.fit_transform(common_corpus) - -""" -import numpy as np -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models - - -class LdaSeqTransformer(TransformerMixin, BaseEstimator): - """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model. - - For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models" - `_. - - """ - def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, - lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, - lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): - """ - - Parameters - ---------- - time_slice : list of int, optional - Number of documents in each time-slice. - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping from an ID to the word it represents in the vocabulary. - alphas : float, optional - The prior probability of each topic. - num_topics : int, optional - Number of latent topics to be discovered in the corpus. - initialize : {'gensim', 'own', 'ldamodel'}, optional - Controls the initialization of the DTM model. Supports three different modes: - * 'gensim': Uses gensim's own LDA initialization. - * 'own': Uses your own initialization matrix of an LDA model that has been previously trained. - * 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument. - sstats : np.ndarray of shape [vocab_len, `num_topics`], optional - If `initialize` is set to 'own' this will be used to initialize the DTM model. - lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional - If `initialize` is set to 'lda_model' this object will be used to create the `sstats` initialization matrix. - obs_variance : float, optional - Observed variance used to approximate the true and forward variance as shown in - `David M. Blei, John D. Lafferty: "Dynamic Topic Models" - `_. - chain_variance : float, optional - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve. - passes : int, optional - Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel` - random_state : {numpy.random.RandomState, int}, optional - Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results. - lda_inference_max_iter : int, optional - Maximum number of iterations in the inference step of the LDA training. - em_min_iter : int, optional - Minimum number of iterations until converge of the Expectation-Maximization algorithm - em_max_iter : int, optional - Maximum number of iterations until converge of the Expectation-Maximization algorithm - chunksize : int, optional - Number of documents in the corpus do be processed in in a chunk. 
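A runnable version of the `LdaSeqTransformer` docstring example above, for reference (pre-removal checkout assumed; the three time slices 3 + 4 + 2 cover the nine documents of `common_corpus`):

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
    >>>
    >>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
    >>> transformed = model.fit_transform(common_corpus)
    >>> assert transformed.shape == (len(common_corpus), 2)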
- - """ - self.gensim_model = None - self.time_slice = time_slice - self.id2word = id2word - self.alphas = alphas - self.num_topics = num_topics - self.initialize = initialize - self.sstats = sstats - self.lda_model = lda_model - self.obs_variance = obs_variance - self.chain_variance = chain_variance - self.passes = passes - self.random_state = random_state - self.lda_inference_max_iter = lda_inference_max_iter - self.em_min_iter = em_min_iter - self.em_max_iter = em_max_iter - self.chunksize = chunksize - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : {iterable of list of (int, number), scipy.sparse matrix} - A collection of documents in BOW format used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer` - The trained model. - - """ - self.gensim_model = models.LdaSeqModel( - corpus=X, time_slice=self.time_slice, id2word=self.id2word, - alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats, - lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance, - passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter, - em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize - ) - return self - - def transform(self, docs): - """Infer the topic distribution for `docs`. - - Parameters - ---------- - docs : {iterable of list of (int, number), scipy.sparse matrix} - A collection of documents in BOW format to be transformed. - - Returns - ------- - numpy.ndarray of shape [`len(docs)`, `num_topics`] - The topic representation of each document. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(docs[0], tuple): - docs = [docs] - proportions = [self.gensim_model[doc] for doc in docs] - return np.reshape(np.array(proportions), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py deleted file mode 100644 index 3c37c1b415..0000000000 --- a/gensim/sklearn_api/lsimodel.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`gensim.models.lsimodel.LsiModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -Integrate with sklearn Pipelines: - -.. sourcecode:: pycon - - >>> from sklearn.pipeline import Pipeline - >>> from sklearn import linear_model - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import LsiTransformer - >>> - >>> # Create stages for our pipeline (including gensim and sklearn models alike). - >>> model = LsiTransformer(num_topics=15, id2word=common_dictionary) - >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - >>> pipe = Pipeline([('features', model,), ('classifier', clf)]) - >>> - >>> # Create some random binary labels for our documents. - >>> labels = np.random.choice([0, 1], len(common_corpus)) - >>> - >>> # How well does our pipeline perform on the training set? 
- >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels) - -""" -import numpy as np -from scipy import sparse -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim import matutils - - -class LsiTransformer(TransformerMixin, BaseEstimator): - """Base LSI module, wraps :class:`~gensim.models.lsimodel.LsiModel`. - - For more information please have a look to `Latent semantic analysis - `_. - - """ - def __init__(self, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, onepass=True, power_iters=2, extra_samples=100): - """ - - Parameters - ---------- - num_topics : int, optional - Number of requested factors (latent dimensions). - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - ID to word mapping, optional. - chunksize : int, optional - Number of documents to be used in each training chunk. - decay : float, optional - Weight of existing observations relatively to new ones. - onepass : bool, optional - Whether the one-pass algorithm should be used for training, pass `False` to force a - multi-pass stochastic algorithm. - power_iters: int, optional - Number of power iteration steps to be used. - Increasing the number of power iterations improves accuracy, but lowers performance. - extra_samples : int, optional - Extra samples to be used besides the rank `k`. Can improve accuracy. - - """ - self.gensim_model = None - self.num_topics = num_topics - self.id2word = id2word - self.chunksize = chunksize - self.decay = decay - self.onepass = onepass - self.extra_samples = extra_samples - self.power_iters = power_iters - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : {iterable of list of (int, number), scipy.sparse matrix} - A collection of documents in BOW format to be transformed. - - Returns - ------- - :class:`~gensim.sklearn_api.lsimodel.LsiTransformer` - The trained model. - - """ - if sparse.issparse(X): - corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - else: - corpus = X - - self.gensim_model = models.LsiModel( - corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, - decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples - ) - return self - - def transform(self, docs): - """Computes the latent factors for `docs`. - - Parameters - ---------- - docs : {iterable of list of (int, number), list of (int, number), scipy.sparse matrix} - Document or collection of documents in BOW format to be transformed. - - Returns - ------- - numpy.ndarray of shape [`len(docs)`, `num_topics`] - Topic distribution matrix. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(docs[0], tuple): - docs = [docs] - # returning dense representation for compatibility with sklearn - # but we should go back to sparse representation in the future - distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] - return np.reshape(np.array(distribution), (len(docs), self.num_topics)) - - def partial_fit(self, X): - """Train model over a potentially incomplete set of documents. - - This method can be used in two ways: - 1. On an unfitted model in which case the model is initialized and trained on `X`. - 2. 
On an already fitted model in which case the model is **further** trained on `X`. - - Parameters - ---------- - X : {iterable of list of (int, number), scipy.sparse matrix} - Stream of document vectors or sparse matrix of shape: [`num_documents`, `num_terms`]. - - Returns - ------- - :class:`~gensim.sklearn_api.lsimodel.LsiTransformer` - The trained model. - - """ - if sparse.issparse(X): - X = matutils.Sparse2Corpus(sparse=X, documents_columns=False) - - if self.gensim_model is None: - self.gensim_model = models.LsiModel( - num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, decay=self.decay, - onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples - ) - - self.gensim_model.add_documents(corpus=X) - return self diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py deleted file mode 100644 index f3757c91bc..0000000000 --- a/gensim/sklearn_api/phrases.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2011 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for `gensim.models.phrases.Phrases`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.sklearn_api.phrases import PhrasesTransformer - >>> - >>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured. - >>> m = PhrasesTransformer(min_count=1, threshold=3) - >>> texts = [ - ... ['I', 'love', 'computer', 'science'], - ... ['computer', 'science', 'is', 'my', 'passion'], - ... ['I', 'studied', 'computer', 'science'] - ... ] - >>> - >>> # Use sklearn fit_transform to see the transformation. - >>> # Since computer and science were seen together 3+ times they are considered a phrase. - >>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0] - -""" - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim.models.phrases import FrozenPhrases, ENGLISH_CONNECTOR_WORDS # noqa:F401 - - -class PhrasesTransformer(TransformerMixin, BaseEstimator): - """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`. - - For more information, please have a look to `Mikolov, et. al: "Distributed Representations - of Words and Phrases and their Compositionality" `_ and - `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction" - `_. - - """ - def __init__( - self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter='_', progress_per=10000, scoring='default', connector_words=frozenset(), - ): - """ - - Parameters - ---------- - min_count : int, optional - Terms with a count lower than this will be ignored - threshold : float, optional - Only phrases scoring above this will be accepted, see `scoring` below. - max_vocab_size : int, optional - Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. - The default of 40M needs about 3.6GB of RAM. - delimiter : str, optional - Character used to join collocation tokens (e.g. '_'). - progress_per : int, optional - Training will report to the logger every that many phrases are learned. - scoring : str or function, optional - Specifies how potential phrases are scored for comparison to the `threshold` - setting. 
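The pipeline example in the deleted `LsiTransformer` docstring uses `np` without importing it; here is a self-contained sketch of the same idea (pre-removal gensim plus scikit-learn assumed):

    >>> import numpy as np
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn import linear_model
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.sklearn_api import LsiTransformer
    >>>
    >>> model = LsiTransformer(num_topics=15, id2word=common_dictionary)
    >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
    >>> pipe = Pipeline([('features', model), ('classifier', clf)])
    >>> labels = np.random.choice([0, 1], len(common_corpus))  # random binary labels, as in the docstring
    >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels)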
`scoring` can be set with either a string that refers to a built-in scoring function, - or with a function with the expected parameter names. Two built-in scoring functions are available - by setting `scoring` to a string: - - * 'default': `Mikolov, et. al: "Distributed Representations of Words and Phrases - and their Compositionality" `_. - * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation - Extraction" `_. - - 'npmi' is more robust when dealing with common words that form part of common bigrams, and - ranges from -1 to 1, but is slower to calculate than the default. - - To use a custom scoring function, create a function with the following parameters and set the `scoring` - parameter to the custom function, see :func:`~gensim.models.phrases.original_scorer` as example. - You must define all the parameters (but can use only part of it): - - * worda_count: number of occurrences in `sentences` of the first token in the phrase being scored - * wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored - * bigram_count: number of occurrences in `sentences` of the phrase being scored - * len_vocab: the number of unique tokens in `sentences` - * min_count: the `min_count` setting of the Phrases class - * corpus_word_count: the total number of (non-unique) tokens in `sentences` - - A scoring function without any of these parameters (even if the parameters are not used) will - raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable. - connector_words : set of str, optional - Set of words that may be included within a phrase, without affecting its scoring. - No phrase can start nor end with a connector word; a phrase may contain any number of - connector words in the middle. - - **If your texts are in English, set ``connector_words=phrases.ENGLISH_CONNECTOR_WORDS``.** - This will cause phrases to include common English articles, prepositions and - conjuctions, such as `bank_of_america` or `eye_of_the_beholder`. - - For other languages or specific applications domains, use custom ``connector_words`` - that make sense there: ``connector_words=frozenset("der die das".split())`` etc. - - """ - self.gensim_model = None - self.phraser = None - self.min_count = min_count - self.threshold = threshold - self.max_vocab_size = max_vocab_size - self.delimiter = delimiter - self.progress_per = progress_per - self.scoring = scoring - self.connector_words = connector_words - - def __setstate__(self, state): - self.__dict__ = state - self.connector_words = frozenset() - self.phraser = None - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : iterable of list of str - Sequence of sentences to be used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.phrases.PhrasesTransformer` - The trained model. - - """ - self.gensim_model = models.Phrases( - sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, - progress_per=self.progress_per, scoring=self.scoring, connector_words=self.connector_words, - ) - self.phraser = FrozenPhrases(self.gensim_model) - return self - - def transform(self, docs): - """Transform the input documents into phrase tokens. - - Words in the sentence will be joined by `self.delimiter`. - - Parameters - ---------- - docs : {iterable of list of str, list of str} - Sequence of documents to be used transformed. 
- - Returns - ------- - iterable of str - Phrase representation for each of the input sentences. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - if self.phraser is None: - self.phraser = FrozenPhrases(self.gensim_model) - - # input as python lists - if isinstance(docs[0], str): - docs = [docs] - - return [self.phraser[doc] for doc in docs] - - def partial_fit(self, X): - """Train model over a potentially incomplete set of sentences. - - This method can be used in two ways: - 1. On an unfitted model in which case the model is initialized and trained on `X`. - 2. On an already fitted model in which case the X sentences are **added** to the vocabulary. - - Parameters - ---------- - X : iterable of list of str - Sequence of sentences to be used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.phrases.PhrasesTransformer` - The trained model. - - """ - if self.gensim_model is None: - self.gensim_model = models.Phrases( - sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, - progress_per=self.progress_per, scoring=self.scoring, connector_words=self.connector_words, - ) - - self.gensim_model.add_vocab(X) - self.phraser = FrozenPhrases(self.gensim_model) - return self diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py deleted file mode 100644 index e676c1b3fa..0000000000 --- a/gensim/sklearn_api/rpmodel.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.rpmodel.RpModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.sklearn_api.rpmodel import RpTransformer - >>> from gensim.test.utils import common_dictionary, common_corpus - >>> - >>> # Initialize and fit the model. - >>> model = RpTransformer(id2word=common_dictionary).fit(common_corpus) - >>> - >>> # Use the trained model to transform a document. - >>> result = model.transform(common_corpus[3]) - -""" - -import numpy as np -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models -from gensim import matutils - - -class RpTransformer(TransformerMixin, BaseEstimator): - """Base Word2Vec module, wraps :class:`~gensim.models.rpmodel.RpModel`. - - For more information please have a look to `Random projection `_. - - """ - def __init__(self, id2word=None, num_topics=300): - """ - - Parameters - ---------- - id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional - Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`. - num_topics : int, optional - Number of dimensions. - - """ - self.gensim_model = None - self.id2word = id2word - self.num_topics = num_topics - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : iterable of list of (int, number) - Input corpus in BOW format. - - Returns - ------- - :class:`~gensim.sklearn_api.rpmodel.RpTransformer` - The trained model. 
- - """ - self.gensim_model = models.RpModel(corpus=X, id2word=self.id2word, num_topics=self.num_topics) - return self - - def transform(self, docs): - """Find the Random Projection factors for `docs`. - - Parameters - ---------- - docs : {iterable of iterable of (int, int), list of (int, number)} - Document or documents to be transformed in BOW format. - - Returns - ------- - numpy.ndarray of shape [`len(docs)`, `num_topics`] - RP representation for each input document. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(docs[0], tuple): - docs = [docs] - # returning dense representation for compatibility with sklearn - # but we should go back to sparse representation in the future - presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] - return np.reshape(np.array(presentation), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py deleted file mode 100644 index 8d982bfa9c..0000000000 --- a/gensim/sklearn_api/text2bow.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2011 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.corpora.dictionary.Dictionary`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.sklearn_api import Text2BowTransformer - >>> - >>> # Get a corpus as an iterable of unicode strings. - >>> texts = [u'complier system computer', u'loading computer system'] - >>> - >>> # Create a transformer.. - >>> model = Text2BowTransformer() - >>> - >>> # Use sklearn-style `fit_transform` to get the BOW representation of each document. - >>> model.fit_transform(texts) - [[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]] - -""" - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim.corpora import Dictionary -from gensim.utils import tokenize - - -class Text2BowTransformer(TransformerMixin, BaseEstimator): - """Base Text2Bow module , wraps :class:`~gensim.corpora.dictionary.Dictionary`. - - For more information please have a look to `Bag-of-words model `_. - - """ - def __init__(self, prune_at=2000000, tokenizer=tokenize): - """ - Parameters - ---------- - prune_at : int, optional - Total number of unique words. Dictionary will keep not more than `prune_at` words. - tokenizer : callable (str -> list of str), optional - A callable to split a document into a list of each terms, default is :func:`gensim.utils.tokenize`. - - """ - self.gensim_model = None - self.prune_at = prune_at - self.tokenizer = tokenizer - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : iterable of str - A collection of documents used for training the model. - - Returns - ------- - :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer` - The trained model. - - """ - tokenized_docs = [list(self.tokenizer(x)) for x in X] - self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at) - return self - - def transform(self, docs): - """Get the BOW format for the `docs`. 
- - Parameters - ---------- - docs : {iterable of str, str} - A collection of documents to be transformed. - - Returns - ------- - iterable of list (int, int) 2-tuples. - The BOW representation of each document. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # input as python lists - if isinstance(docs, str): - docs = [docs] - tokenized_docs = (list(self.tokenizer(doc)) for doc in docs) - return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs] - - def partial_fit(self, X): - """Train model over a potentially incomplete set of documents. - - This method can be used in two ways: - 1. On an unfitted model in which case the dictionary is initialized and trained on `X`. - 2. On an already fitted model in which case the dictionary is **expanded** by `X`. - - Parameters - ---------- - X : iterable of str - A collection of documents used to train the model. - - Returns - ------- - :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer` - The trained model. - - """ - if self.gensim_model is None: - self.gensim_model = Dictionary(prune_at=self.prune_at) - - tokenized_docs = [list(self.tokenizer(x)) for x in X] - self.gensim_model.add_documents(tokenized_docs) - return self diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py deleted file mode 100644 index 8834a31298..0000000000 --- a/gensim/sklearn_api/tfidf.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2011 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit-learn interface for :class:`~gensim.models.tfidfmodel.TfidfModel`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import TfIdfTransformer - >>> - >>> # Transform the word counts inversely to their global frequency using the sklearn interface. - >>> model = TfIdfTransformer(dictionary=common_dictionary) - >>> tfidf_corpus = model.fit_transform(common_corpus) - -""" -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim.models import TfidfModel -import gensim - - -class TfIdfTransformer(TransformerMixin, BaseEstimator): - """Base TfIdf module, wraps :class:`~gensim.models.tfidfmodel.TfidfModel`. - - For more information see `tf-idf `_. - - """ - def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="nfc", - pivot=None, slope=0.65): - """ - - Parameters - ---------- - - id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional - Mapping from int id to word token, that was used for converting input data to bag of words format. - dictionary : :class:`~gensim.corpora.Dictionary`, optional - If specified it will be used to directly construct the inverse document frequency mapping. - wlocals : function, optional - Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` which does nothing. - Other options include :func:`math.sqrt`, :func:`math.log1p`, etc. - wglobal : function, optional - Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. 
- normalize : bool, optional - It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length - (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts - and returns a sparse vector. - smartirs : str, optional - SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, - a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. - The mnemonic for representing a combination of weights takes the form XYZ, - for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. - - local_letter : str - Term frequency weighing, one of: - * `b` - binary, - * `t` or `n` - raw, - * `a` - augmented, - * `l` - logarithm, - * `d` - double logarithm, - * `L` - log average. - global_letter : str - Document frequency weighting, one of: - * `x` or `n` - none, - * `f` - idf, - * `t` - zero-corrected idf, - * `p` - probabilistic idf. - normalization_letter : str - Document normalization, one of: - * `x` or `n` - none, - * `c` - cosine, - * `u` - pivoted unique, - * `b` - pivoted character length. - - Default is `nfc`. - For more info, visit `"Wikipedia" `_. - pivot : float, optional - It is the point around which the regular normalization curve is `tilted` to get the new pivoted - normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: - "Pivoted Document Length Normalization" `_ it is the point where the - retrieval and relevance curves intersect. - This parameter along with `slope` is used for pivoted document length normalization. - When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either - `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no - pivoted document length normalization is applied. - slope : float, optional - It is the parameter required by pivoted document length normalization which determines the slope to which - the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not - None. - - See Also - -------- - ~gensim.models.tfidfmodel.TfidfModel : Class that also uses the SMART scheme. - ~gensim.models.tfidfmodel.resolve_weights : Function that also uses the SMART scheme. - - """ - self.gensim_model = None - self.id2word = id2word - self.dictionary = dictionary - self.wlocal = wlocal - self.wglobal = wglobal - self.normalize = normalize - self.smartirs = smartirs - self.slope = slope - self.pivot = pivot - - def fit(self, X, y=None): - """Fit the model from the given training data. - - Parameters - ---------- - X : iterable of iterable of (int, int) - Input corpus - y : None - Ignored. TF-IDF is an unsupervised model. - - Returns - ------- - :class:`~gensim.sklearn_api.tfidf.TfIdfTransformer` - The trained model. - - """ - self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal, - wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs, - pivot=self.pivot, slope=self.slope, - ) - return self - - def transform(self, docs): - """Get the tf-idf scores for `docs` in a bag-of-words representation. - - Parameters - ---------- - docs: {iterable of list of (int, number)} - Document or corpus in bag-of-words format. - - Returns - ------- - iterable of list (int, float) 2-tuples. - The bag-of-words representation of each input document. 
- - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # Is the input a single document? - if isinstance(docs[0], tuple): - docs = [docs] # Yes => convert it to a corpus (of 1 document). - return [self.gensim_model[doc] for doc in docs] diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py deleted file mode 100644 index 8c0bd932a1..0000000000 --- a/gensim/sklearn_api/w2vmodel.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Chinmaya Pancholi -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Scikit learn interface for :class:`~gensim.models.word2vec.Word2Vec`. - -Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. - - -Examples --------- -.. sourcecode:: pycon - - >>> from gensim.test.utils import common_texts - >>> from gensim.sklearn_api import W2VTransformer - >>> - >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = W2VTransformer(vector_size=10, min_count=1, seed=1) - >>> - >>> # What is the vector representation of the word 'graph'? - >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) - >>> assert wordvecs.shape == (2, 10) - -""" - -import numpy as np - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.exceptions import NotFittedError - -from gensim import models - - -class W2VTransformer(TransformerMixin, BaseEstimator): - """Base Word2Vec module, wraps :class:`~gensim.models.word2vec.Word2Vec`. - - For more information please have a look to `Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean: "Efficient - Estimation of Word Representations in Vector Space" `_. - - """ - def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): - """ - - Parameters - ---------- - vector_size : int - Dimensionality of the feature vectors. - alpha : float - The initial learning rate. - window : int - The maximum distance between the current and predicted word within a sentence. - min_count : int - Ignores all words with total frequency lower than this. - max_vocab_size : int - Limits the RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. - Set to `None` for no limit. - sample : float - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - seed : int - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, - you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). - workers : int - Use these many worker threads to train the model (=faster training with multicore machines). - min_alpha : float - Learning rate will linearly drop to `min_alpha` as training progresses. 
- sg : int {1, 0} - Defines the training algorithm. If 1, CBOW is used, otherwise, skip-gram is employed. - hs : int {1,0} - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : int {1,0} - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : callable (object -> int), optional - A hashing function. Used to create an initial random reproducible vector by hashing the random seed. - epochs : int - Number of iterations (epochs) over the corpus. - null_word : int {1, 0} - If 1, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words) - trim_rule : function - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - sorted_vocab : int {1,0} - If 1, sort the vocabulary by descending frequency before assigning word indexes. - batch_words : int - Target size (in words) for batches of examples passed to worker threads (and - thus cython routines).(Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - - """ - self.gensim_model = None - self.vector_size = vector_size - self.alpha = alpha - self.window = window - self.min_count = min_count - self.max_vocab_size = max_vocab_size - self.sample = sample - self.seed = seed - self.workers = workers - self.min_alpha = min_alpha - self.sg = sg - self.hs = hs - self.negative = negative - self.cbow_mean = int(cbow_mean) - self.hashfxn = hashfxn - self.epochs = epochs - self.null_word = null_word - self.trim_rule = trim_rule - self.sorted_vocab = sorted_vocab - self.batch_words = batch_words - - def fit(self, X, y=None): - """Fit the model according to the given training data. - - Parameters - ---------- - X : iterable of iterables of str - The input corpus. X can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - - Returns - ------- - :class:`~gensim.sklearn_api.w2vmodel.W2VTransformer` - The trained model. 
- - """ - self.gensim_model = models.Word2Vec( - sentences=X, vector_size=self.vector_size, alpha=self.alpha, - window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, - sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, - sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, - sorted_vocab=self.sorted_vocab, batch_words=self.batch_words - ) - return self - - def transform(self, words): - """Get the word vectors the input words. - - Parameters - ---------- - words : {iterable of str, str} - Word or a collection of words to be transformed. - - Returns - ------- - np.ndarray of shape [`len(words)`, `vector_size`] - A 2D array where each row is the vector of one word. - - """ - if self.gensim_model is None: - raise NotFittedError( - "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - # The input as array of array - if isinstance(words, str): - words = [words] - vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.vector_size)) - - def partial_fit(self, X): - raise NotImplementedError( - "'partial_fit' has not been implemented for W2VTransformer. " - "However, the model can be updated with a fixed vocabulary using Gensim API call." - ) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py deleted file mode 100644 index 523e658e66..0000000000 --- a/gensim/test/test_coherencemodel.py +++ /dev/null @@ -1,364 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). -""" - -import logging -import os -import unittest -from unittest import SkipTest -import multiprocessing as mp -from functools import partial - -import numpy as np -from gensim.matutils import argsort -from gensim.models.coherencemodel import CoherenceModel, BOOLEAN_DOCUMENT_BASED -from gensim.models.ldamodel import LdaModel -from gensim.models.wrappers import LdaMallet -from gensim.models.wrappers import LdaVowpalWabbit -from gensim.test.utils import get_tmpfile, common_texts, common_dictionary, common_corpus - - -class TestCoherenceModel(unittest.TestCase): - - # set up vars used in testing ("Deerwester" from the web tutorial) - texts = common_texts - dictionary = common_dictionary - corpus = common_corpus - - def setUp(self): - # Suppose given below are the topics which two different LdaModels come up with. - # `topics1` is clearly better as it has a clear distinction between system-human - # interaction and graphs. Hence both the coherence measures for `topics1` should be - # greater. 
- self.topics1 = [ - ['human', 'computer', 'system', 'interface'], - ['graph', 'minors', 'trees', 'eps'] - ] - self.topics2 = [ - ['user', 'graph', 'minors', 'system'], - ['time', 'graph', 'survey', 'minors'] - ] - self.ldamodel = LdaModel( - corpus=self.corpus, id2word=self.dictionary, num_topics=2, - passes=0, iterations=0 - ) - - mallet_home = os.environ.get('MALLET_HOME', None) - self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None - if self.mallet_path: - self.malletmodel = LdaMallet( - mallet_path=self.mallet_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, iterations=0 - ) - - vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) - if not vw_path: - logging.info( - "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" - ) - self.vw_path = None - else: - self.vw_path = vw_path - self.vwmodel = LdaVowpalWabbit( - self.vw_path, corpus=self.corpus, id2word=self.dictionary, - num_topics=2, passes=0 - ) - - def check_coherence_measure(self, coherence): - """Check provided topic coherence algorithm on given topics""" - if coherence in BOOLEAN_DOCUMENT_BASED: - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence) - else: - kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence) - - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm2 = CoherenceModel(topics=self.topics2, **kwargs) - self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) - - def testUMass(self): - """Test U_Mass topic coherence algorithm on given topics""" - self.check_coherence_measure('u_mass') - - def testCv(self): - """Test C_v topic coherence algorithm on given topics""" - self.check_coherence_measure('c_v') - - def testCuci(self): - """Test C_uci topic coherence algorithm on given topics""" - self.check_coherence_measure('c_uci') - - def testCnpmi(self): - """Test C_npmi topic coherence algorithm on given topics""" - self.check_coherence_measure('c_npmi') - - def testUMassLdaModel(self): - """Perform sanity check to see if u_mass coherence works with LDA Model""" - # Note that this is just a sanity check because LDA does not guarantee a better coherence - # value on the topics if iterations are increased. 
This can be seen here: - # https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde - CoherenceModel(model=self.ldamodel, corpus=self.corpus, coherence='u_mass') - - def testCvLdaModel(self): - """Perform sanity check to see if c_v coherence works with LDA Model""" - CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_v') - - def testCw2vLdaModel(self): - """Perform sanity check to see if c_w2v coherence works with LDAModel.""" - CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_w2v') - - def testCuciLdaModel(self): - """Perform sanity check to see if c_uci coherence works with LDA Model""" - CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_uci') - - def testCnpmiLdaModel(self): - """Perform sanity check to see if c_npmi coherence works with LDA Model""" - CoherenceModel(model=self.ldamodel, texts=self.texts, coherence='c_npmi') - - def testUMassMalletModel(self): - """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" - self._check_for_mallet() - CoherenceModel(model=self.malletmodel, corpus=self.corpus, coherence='u_mass') - - def _check_for_mallet(self): - if not self.mallet_path: - raise SkipTest("Mallet not installed") - - def testCvMalletModel(self): - """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper""" - self._check_for_mallet() - CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_v') - - def testCw2vMalletModel(self): - """Perform sanity check to see if c_w2v coherence works with LDA Mallet gensim wrapper""" - self._check_for_mallet() - CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_w2v') - - def testCuciMalletModel(self): - """Perform sanity check to see if c_uci coherence works with LDA Mallet gensim wrapper""" - self._check_for_mallet() - CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_uci') - - def testCnpmiMalletModel(self): - """Perform sanity check to see if c_npmi coherence works with LDA Mallet gensim wrapper""" - self._check_for_mallet() - CoherenceModel(model=self.malletmodel, texts=self.texts, coherence='c_npmi') - - def testUMassVWModel(self): - """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" - self._check_for_vw() - CoherenceModel(model=self.vwmodel, corpus=self.corpus, coherence='u_mass') - - def _check_for_vw(self): - if not self.vw_path: - raise SkipTest("Vowpal Wabbit not installed") - - def testCvVWModel(self): - """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper""" - self._check_for_vw() - CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_v') - - def testCw2vVWModel(self): - """Perform sanity check to see if c_w2v coherence works with LDA VW gensim wrapper""" - self._check_for_vw() - CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_w2v') - - def testCuciVWModel(self): - """Perform sanity check to see if c_uci coherence works with LDA VW gensim wrapper""" - self._check_for_vw() - CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_uci') - - def testCnpmiVWModel(self): - """Perform sanity check to see if c_npmi coherence works with LDA VW gensim wrapper""" - self._check_for_vw() - CoherenceModel(model=self.vwmodel, texts=self.texts, coherence='c_npmi') - - def testErrors(self): - """Test if errors are raised on bad input""" - # not providing dictionary - self.assertRaises( - ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - 
coherence='u_mass' - ) - # not providing texts for c_v and instead providing corpus - self.assertRaises( - ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - dictionary=self.dictionary, coherence='c_v' - ) - # not providing corpus or texts for u_mass - self.assertRaises( - ValueError, CoherenceModel, topics=self.topics1, dictionary=self.dictionary, - coherence='u_mass' - ) - - def testProcesses(self): - get_model = partial(CoherenceModel, - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' - ) - - model, used_cpus = get_model(), mp.cpu_count() - 1 - self.assertEqual(model.processes, used_cpus) - for p in range(-2, 1): - self.assertEqual(get_model(processes=p).processes, used_cpus) - - for p in range(1, 4): - self.assertEqual(get_model(processes=p).processes, p) - - def testPersistence(self): - fname = get_tmpfile('gensim_models_coherence.tst') - model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' - ) - model.save(fname) - model2 = CoherenceModel.load(fname) - self.assertTrue(model.get_coherence() == model2.get_coherence()) - - def testPersistenceCompressed(self): - fname = get_tmpfile('gensim_models_coherence.tst.gz') - model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' - ) - model.save(fname) - model2 = CoherenceModel.load(fname) - self.assertTrue(model.get_coherence() == model2.get_coherence()) - - def testPersistenceAfterProbabilityEstimationUsingCorpus(self): - fname = get_tmpfile('gensim_similarities.tst.pkl') - model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' - ) - model.estimate_probabilities() - model.save(fname) - model2 = CoherenceModel.load(fname) - self.assertIsNotNone(model2._accumulator) - self.assertTrue(model.get_coherence() == model2.get_coherence()) - - def testPersistenceAfterProbabilityEstimationUsingTexts(self): - fname = get_tmpfile('gensim_similarities.tst.pkl') - model = CoherenceModel( - topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v' - ) - model.estimate_probabilities() - model.save(fname) - model2 = CoherenceModel.load(fname) - self.assertIsNotNone(model2._accumulator) - self.assertTrue(model.get_coherence() == model2.get_coherence()) - - def testAccumulatorCachingSameSizeTopics(self): - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm1.estimate_probabilities() - accumulator = cm1._accumulator - self.assertIsNotNone(accumulator) - cm1.topics = self.topics1 - self.assertEqual(accumulator, cm1._accumulator) - cm1.topics = self.topics2 - self.assertEqual(None, cm1._accumulator) - - def testAccumulatorCachingTopicSubsets(self): - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm1.estimate_probabilities() - accumulator = cm1._accumulator - self.assertIsNotNone(accumulator) - cm1.topics = [t[:2] for t in self.topics1] - self.assertEqual(accumulator, cm1._accumulator) - cm1.topics = self.topics1 - self.assertEqual(accumulator, cm1._accumulator) - - def testAccumulatorCachingWithModelSetting(self): - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm1.estimate_probabilities() - self.assertIsNotNone(cm1._accumulator) - 
cm1.model = self.ldamodel - topics = [] - for topic in self.ldamodel.state.get_lambda(): - bestn = argsort(topic, topn=cm1.topn, reverse=True) - topics.append(bestn) - self.assertTrue(np.array_equal(topics, cm1.topics)) - self.assertIsNone(cm1._accumulator) - - def testAccumulatorCachingWithTopnSettingGivenTopics(self): - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, topn=5, coherence='u_mass') - cm1 = CoherenceModel(topics=self.topics1, **kwargs) - cm1.estimate_probabilities() - self.assertIsNotNone(cm1._accumulator) - - accumulator = cm1._accumulator - topics_before = cm1._topics - cm1.topn = 3 - self.assertEqual(accumulator, cm1._accumulator) - self.assertEqual(3, len(cm1.topics[0])) - self.assertEqual(topics_before, cm1._topics) - - # Topics should not have been truncated, so topn settings below 5 should work - cm1.topn = 4 - self.assertEqual(accumulator, cm1._accumulator) - self.assertEqual(4, len(cm1.topics[0])) - self.assertEqual(topics_before, cm1._topics) - - with self.assertRaises(ValueError): - cm1.topn = 6 # can't expand topics any further without model - - def testAccumulatorCachingWithTopnSettingGivenModel(self): - kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, topn=5, coherence='u_mass') - cm1 = CoherenceModel(model=self.ldamodel, **kwargs) - cm1.estimate_probabilities() - self.assertIsNotNone(cm1._accumulator) - - accumulator = cm1._accumulator - topics_before = cm1._topics - cm1.topn = 3 - self.assertEqual(accumulator, cm1._accumulator) - self.assertEqual(3, len(cm1.topics[0])) - self.assertEqual(topics_before, cm1._topics) - - cm1.topn = 6 # should be able to expand given the model - self.assertEqual(6, len(cm1.topics[0])) - - def testCompareCoherenceForTopics(self): - topics = [self.topics1, self.topics2] - cm = CoherenceModel.for_topics( - topics, dictionary=self.dictionary, texts=self.texts, coherence='c_v') - self.assertIsNotNone(cm._accumulator) - - # Accumulator should have all relevant IDs. - for topic_list in topics: - cm.topics = topic_list - self.assertIsNotNone(cm._accumulator) - - (coherence_topics1, coherence1), (coherence_topics2, coherence2) = \ - cm.compare_model_topics(topics) - - self.assertAlmostEqual(np.mean(coherence_topics1), coherence1, 4) - self.assertAlmostEqual(np.mean(coherence_topics2), coherence2, 4) - self.assertGreater(coherence1, coherence2) - - def testCompareCoherenceForModels(self): - models = [self.ldamodel, self.ldamodel] - cm = CoherenceModel.for_models( - models, dictionary=self.dictionary, texts=self.texts, coherence='c_v') - self.assertIsNotNone(cm._accumulator) - - # Accumulator should have all relevant IDs. 
- for model in models: - cm.model = model - self.assertIsNotNone(cm._accumulator) - - (coherence_topics1, coherence1), (coherence_topics2, coherence2) = \ - cm.compare_models(models) - - self.assertAlmostEqual(np.mean(coherence_topics1), coherence1, 4) - self.assertAlmostEqual(np.mean(coherence_topics2), coherence2, 4) - self.assertAlmostEqual(coherence1, coherence2, places=4) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_d2vmodel.py b/gensim/test/test_d2vmodel.py deleted file mode 100644 index aa24203277..0000000000 --- a/gensim/test/test_d2vmodel.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking D2VTransformer class. -""" - -import unittest -import logging -from gensim.sklearn_api import D2VTransformer -from gensim.test.utils import common_texts - - -class IteratorForIterable: - """Iterator capable of folding into list.""" - def __init__(self, iterable): - self._data = iterable - self._index = 0 - - def __next__(self): - if len(self._data) > self._index: - result = self._data[self._index] - self._index += 1 - return result - raise StopIteration - - -class IterableWithoutZeroElement: - """ - Iterable, emulating pandas.Series behaviour without 0-th element. - Equivalent to calling `series.index += 1`. - """ - def __init__(self, data): - self.data = data - - def __getitem__(self, key): - if key == 0: - raise KeyError("Emulation of absence of item with key 0.") - return self.data[key] - - def __iter__(self): - return IteratorForIterable(self.data) - - -class TestD2VTransformer(unittest.TestCase): - def TestWorksWithIterableNotHavingElementWithZeroIndex(self): - a = IterableWithoutZeroElement(common_texts) - transformer = D2VTransformer(min_count=1, vector_size=5) - transformer.fit(a) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py deleted file mode 100644 index 2ad1ccb9c9..0000000000 --- a/gensim/test/test_ldamallet_wrapper.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). -""" - - -import logging -import unittest -import os -import os.path - -import numpy as np - -from gensim.corpora import mmcorpus, Dictionary -from gensim.models.wrappers import ldamallet -from gensim import matutils -from gensim.utils import simple_preprocess -from gensim.models import ldamodel -from gensim.test import basetmtests -from gensim.test.utils import datapath, get_tmpfile, common_texts -import gensim.downloader as api - -dictionary = Dictionary(common_texts) -corpus = [dictionary.doc2bow(text) for text in common_texts] - - -class TestLdaMallet(unittest.TestCase, basetmtests.TestBaseTopicModel): - def setUp(self): - mallet_home = os.environ.get('MALLET_HOME', None) - self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None - if not self.mallet_path: - raise unittest.SkipTest("MALLET_HOME not specified. 
Skipping Mallet tests.") - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - - # self.model is used in TestBaseTopicModel - self.model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=1) - - def testTransform(self): - if not self.mallet_path: - return - passed = False - for i in range(5): # restart at most 5 times - # create the transformation model - model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200) - # transform one document - doc = list(corpus)[0] - transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests - expected = [0.49, 0.51] - # must contain the same values, up to re-ordering - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) - if passed: - break - logging.warning( - "LDA failed to converge on attempt %i (got %s, expected %s)", - i, sorted(vec), sorted(expected) - ) - self.assertTrue(passed) - - def testSparseTransform(self): - if not self.mallet_path: - return - passed = False - for i in range(5): # restart at most 5 times - # create the sparse transformation model with the appropriate topic_threshold - model = ldamallet.LdaMallet( - self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5 - ) - # transform one document - doc = list(corpus)[0] - transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests - expected = [1.0, 0.0] - # must contain the same values, up to re-ordering - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) - if passed: - break - logging.warning( - "LDA failed to converge on attempt %i (got %s, expected %s)", - i, sorted(vec), sorted(expected) - ) - self.assertTrue(passed) - - def testMallet2Model(self): - if not self.mallet_path: - return - - tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary) - tm2 = ldamallet.malletmodel2ldamodel(tm1) - - # set num_topics=-1 to exclude random influence - self.assertEqual(tm1.show_topics(-1, 10), tm2.show_topics(-1, 10)) - - for document in corpus: - element1_1, element1_2 = tm1[document][0] - element2_1, element2_2 = tm2[document][0] - self.assertAlmostEqual(element1_1, element2_1) - self.assertAlmostEqual(element1_2, element2_2, 1) - element1_1, element1_2 = tm1[document][1] - element2_1, element2_2 = tm2[document][1] - self.assertAlmostEqual(element1_1, element2_1) - self.assertAlmostEqual(element1_2, element2_2, 1) - logging.debug('%d %d', element1_1, element2_1) - logging.debug('%d %d', element1_2, element2_2) - logging.debug('%s %s', tm1[document][1], tm2[document][1]) - - def testMallet2ModelOn20NewsGroups(self): - corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")] - dictionary = Dictionary(corpus) - - corpus = [dictionary.doc2bow(text) for text in corpus] - - lda_mallet_model = ldamallet.LdaMallet( - self.mallet_path, corpus=corpus, - num_topics=20, id2word=dictionary, iterations=500) - - lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000) - self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50)) - - def testPersistence(self): - if not self.mallet_path: - return - fname = get_tmpfile('gensim_models_lda_mallet.tst') - model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) - model.save(fname) - model2 = ldamallet.LdaMallet.load(fname) - 
self.assertEqual(model.num_topics, model2.num_topics) - self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) - tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - - def testPersistenceCompressed(self): - if not self.mallet_path: - return - fname = get_tmpfile('gensim_models_lda_mallet.tst.gz') - model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) - model.save(fname) - model2 = ldamallet.LdaMallet.load(fname, mmap=None) - self.assertEqual(model.num_topics, model2.num_topics) - self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) - tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - - def testLargeMmap(self): - if not self.mallet_path: - return - fname = get_tmpfile('gensim_models_lda_mallet.tst') - model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) - - # simulate storing large arrays separately - model.save(fname, sep_limit=0) - - # test loading the large model arrays with mmap - model2 = ldamodel.LdaModel.load(fname, mmap='r') - self.assertEqual(model.num_topics, model2.num_topics) - self.assertTrue(isinstance(model2.word_topics, np.memmap)) - self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) - tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - - def testLargeMmapCompressed(self): - if not self.mallet_path: - return - fname = get_tmpfile('gensim_models_lda_mallet.tst.gz') - model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) - - # simulate storing large arrays separately - model.save(fname, sep_limit=0) - - # test loading the large model arrays with mmap - self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r') - - def test_random_seed(self): - if not self.mallet_path: - return - - # test that 2 models created with the same random_seed are equal in their topics treatment - SEED = 10 - NUM_TOPICS = 10 - ITER = 500 - - tm1 = ldamallet.LdaMallet( - self.mallet_path, - corpus=corpus, - num_topics=NUM_TOPICS, - id2word=dictionary, - random_seed=SEED, - iterations=ITER, - ) - - tm2 = ldamallet.LdaMallet( - self.mallet_path, - corpus=corpus, - num_topics=NUM_TOPICS, - id2word=dictionary, - random_seed=SEED, - iterations=ITER, - ) - self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) - - for doc in corpus: - tm1_vector = matutils.sparse2full(tm1[doc], NUM_TOPICS) - tm2_vector = matutils.sparse2full(tm2[doc], NUM_TOPICS) - - self.assertTrue(np.allclose(tm1_vector, tm2_vector)) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py deleted file mode 100644 index ddec17da23..0000000000 --- a/gensim/test/test_ldavowpalwabbit_wrapper.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Tests for Vowpal Wabbit LDA wrapper. - -Will not be run unless the environment variable 'VOWPAL_WABBIT_PATH' is set -and points to the `vw` executable. 
-""" - - -import logging -import unittest -import os -import os.path -import tempfile -from collections import defaultdict - -from gensim.corpora import Dictionary - -import gensim.models.wrappers.ldavowpalwabbit as ldavowpalwabbit -from gensim.models.wrappers.ldavowpalwabbit import LdaVowpalWabbit -from gensim.test.utils import datapath - - -# set up vars used in testing ("Deerwester" from the web tutorial) -TOPIC_WORDS = [ - 'cat lion leopard mouse jaguar lynx cheetah tiger kitten puppy'.split(), - 'engine car wheel brakes tyre motor suspension cylinder exhaust clutch'.split(), - 'alice bob robert tim sue rachel dave harry alex jim'.split(), - 'c cplusplus go python haskell scala java ruby csharp erlang'.split(), - 'eggs ham mushrooms cereal coffee beans tea juice sausages bacon'.split() -] - - -def get_corpus(): - text_path = datapath('ldavowpalwabbit.txt') - dict_path = datapath('ldavowpalwabbit.dict.txt') - dictionary = Dictionary.load_from_text(dict_path) - with open(text_path) as fhandle: - corpus = [dictionary.doc2bow(line.strip().split()) for line in fhandle] - return corpus, dictionary - - -class TestLdaVowpalWabbit(unittest.TestCase): - def setUp(self): - vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) - if not vw_path: - msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping tests" - - try: - raise unittest.SkipTest(msg) - except AttributeError: - # couldn't find a way of skipping tests in python 2.6 - self.vw_path = None - - corpus, dictionary = get_corpus() - self.vw_path = vw_path - self.corpus = corpus - self.dictionary = dictionary - - def test_save_load(self): - """Test loading/saving LdaVowpalWabbit model.""" - if not self.vw_path: # for python 2.6 - return - lda = LdaVowpalWabbit( - self.vw_path, corpus=self.corpus, passes=10, chunksize=256, - id2word=self.dictionary, cleanup_files=True, alpha=0.1, - eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 - ) - - with tempfile.NamedTemporaryFile() as fhandle: - lda.save(fhandle.name) - lda2 = LdaVowpalWabbit.load(fhandle.name) - - # ensure public fields are saved/loaded correctly - saved_fields = [ - lda.alpha, lda.chunksize, lda.cleanup_files, - lda.decay, lda.eta, lda.gamma_threshold, - lda.id2word, lda.num_terms, lda.num_topics, - lda.passes, lda.random_seed, lda.vw_path - ] - loaded_fields = [ - lda2.alpha, lda2.chunksize, lda2.cleanup_files, - lda2.decay, lda2.eta, lda2.gamma_threshold, - lda2.id2word, lda2.num_terms, lda2.num_topics, - lda2.passes, lda2.random_seed, lda2.vw_path - ] - self.assertEqual(saved_fields, loaded_fields) - - # ensure topic matrices are saved/loaded correctly - saved_topics = lda.show_topics(num_topics=5, num_words=10) - loaded_topics = lda2.show_topics(num_topics=5, num_words=10) - self.assertEqual(loaded_topics, saved_topics) - - def test_model_update(self): - """Test updating existing LdaVowpalWabbit model.""" - if not self.vw_path: # for python 2.6 - return - lda = LdaVowpalWabbit( - self.vw_path, corpus=[self.corpus[0]], passes=10, chunksize=256, - id2word=self.dictionary, cleanup_files=True, alpha=0.1, - eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 - ) - - lda.update(self.corpus[1:]) - result = lda.log_perplexity(self.corpus) - self.assertTrue(result < -1) - self.assertTrue(result > -5) - - def test_perplexity(self): - """Test LdaVowpalWabbit perplexity is within expected range.""" - if not self.vw_path: # for python 2.6 - return - lda = LdaVowpalWabbit( - self.vw_path, corpus=self.corpus, passes=10, chunksize=256, - id2word=self.dictionary, 
cleanup_files=True, alpha=0.1, - eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1) - - # varies, but should be between -1 and -5 - result = lda.log_perplexity(self.corpus) - self.assertTrue(result < -1) - self.assertTrue(result > -5) - - def test_topic_coherence(self): - """Test LdaVowpalWabbit topic coherence.""" - if not self.vw_path: # for python 2.6 - return - corpus, dictionary = get_corpus() - lda = LdaVowpalWabbit( - self.vw_path, corpus=corpus, passes=10, chunksize=256, - id2word=dictionary, cleanup_files=True, alpha=0.1, - eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 - ) - lda.print_topics(5, 10) - - # map words in known topic to an ID - topic_map = {} - for i, words in enumerate(TOPIC_WORDS): - topic_map[frozenset(words)] = i - - n_coherent = 0 - for topic_id in range(lda.num_topics): - topic = lda.show_topic(topic_id, topn=20) - - # get all words from LDA topic - topic_words = [w[1] for w in topic] - - # get list of original topics that each word actually belongs to - ids = [] - for word in topic_words: - for src_topic_words, src_topic_id in topic_map.items(): - if word in src_topic_words: - ids.append(src_topic_id) - - # count the number of times each original topic appears - counts = defaultdict(int) - for found_topic_id in ids: - counts[found_topic_id] += 1 - - # if at least 6/10 words assigned to same topic, consider it coherent - max_count = 0 - for count in counts.values(): - max_count = max(max_count, count) - - if max_count >= 6: - n_coherent += 1 - - # not 100% deterministic, but should always get 3+ coherent topics - self.assertTrue(n_coherent >= 3) - - def test_corpus_to_vw(self): - """Test corpus to Vowpal Wabbit format conversion.""" - if not self.vw_path: # for python 2.6 - return - corpus = [ - [(0, 5), (7, 1), (5, 3), (0, 2)], - [(7, 2), (2, 1), (3, 11)], - [(1, 1)], - [], - [(5, 2), (0, 1)] - ] - expected = """ -| 0:5 7:1 5:3 0:2 -| 7:2 2:1 3:11 -| 1:1 -| -| 5:2 0:1 -""".strip() - result = '\n'.join(ldavowpalwabbit.corpus_to_vw(corpus)) - self.assertEqual(result, expected) - - def testvwmodel2ldamodel(self): - """Test copying of VWModel to LdaModel""" - if not self.vw_path: - return - tm1 = LdaVowpalWabbit(vw_path=self.vw_path, corpus=self.corpus, num_topics=2, id2word=self.dictionary) - tm2 = ldavowpalwabbit.vwmodel2ldamodel(tm1) - for document in self.corpus: - element1_1, element1_2 = tm1[document][0] - element2_1, element2_2 = tm2[document][0] - self.assertAlmostEqual(element1_1, element2_1) - self.assertAlmostEqual(element1_2, element2_2, 5) - logging.debug('%d %d', element1_1, element2_1) - logging.debug('%d %d', element1_2, element2_2) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py deleted file mode 100644 index 8dcb1b7205..0000000000 --- a/gensim/test/test_sklearn_api.py +++ /dev/null @@ -1,1397 +0,0 @@ -import os -import logging -import unittest -import numpy -import codecs -import pickle - -from scipy import sparse -try: - from sklearn.pipeline import Pipeline - from sklearn import linear_model, cluster - from sklearn.exceptions import NotFittedError -except ImportError: - raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available") - -from gensim.sklearn_api.ftmodel import FTTransformer -from gensim.sklearn_api.rpmodel import RpTransformer -from gensim.sklearn_api.ldamodel import LdaTransformer -from gensim.sklearn_api.lsimodel import 
LsiTransformer -from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer -from gensim.sklearn_api.w2vmodel import W2VTransformer -from gensim.sklearn_api.atmodel import AuthorTopicTransformer -from gensim.sklearn_api.d2vmodel import D2VTransformer -from gensim.sklearn_api.text2bow import Text2BowTransformer -from gensim.sklearn_api.tfidf import TfIdfTransformer -from gensim.sklearn_api.hdp import HdpTransformer -from gensim.sklearn_api.phrases import PhrasesTransformer -from gensim.corpora import mmcorpus, Dictionary -from gensim import matutils, models -from gensim.test.utils import datapath, common_texts - -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) - -texts = [ - ['complier', 'system', 'computer'], - ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'], - ['graph', 'flow', 'network', 'graph'], - ['loading', 'computer', 'system'], - ['user', 'server', 'system'], - ['tree', 'hamiltonian'], - ['graph', 'trees'], - ['computer', 'kernel', 'malfunction', 'computer'], - ['server', 'system', 'computer'], -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] -author2doc = { - 'john': [0, 1, 2, 3, 4, 5, 6], - 'jane': [2, 3, 4, 5, 6, 7, 8], - 'jack': [0, 2, 4, 6, 8], - 'jill': [1, 3, 5, 7] -} - -texts_new = texts[0:3] -author2doc_new = { - 'jill': [0], - 'bob': [0, 1], - 'sally': [1, 2] -} -dictionary_new = Dictionary(texts_new) -corpus_new = [dictionary_new.doc2bow(text) for text in texts_new] - -texts_ldaseq = [ - [ - u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', - u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior' - ], - [ - u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', - u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', - u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', - u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', - u'hiring', u'conducting', u'interviews' - ], - [ - u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', - u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', - u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', - u'participating' - ], - [ - u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', - u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', - u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', - u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', - u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', - u'openings', u'jobs' - ], - [ - u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', - u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', - u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', - u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', - u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', 
u'aspects', - u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', - u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', - u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', - u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', - u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', - u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', - u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', - u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', - u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', - u'skills', u'engineering', u'quality', u'engineering' - ], - [ - u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', - u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', - u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', - u'knowledge', u'applications', u'manipulate', u'applications', u'engineering' - ], - [ - u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', - u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', - u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', - u'solutions', u'biggest', u'insurers', u'operates', u'investment' - ], - [ - u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', - u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', - u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', - u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', - u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', - u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', - u'estimation', u'testing', u'procedures', u'voltage', u'engineering' - ], - [ - u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', - u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', - u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', - u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', - u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', - u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', - u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', - u'contracting', u'southwest', u'electrical', u'contractors' - ], - [ - u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', - u'hardware', 
u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', - u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', - u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', - u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', - u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', - u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', - u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', - u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', - u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', - u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', - u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', - u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians' - ], - [ - u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', - u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', - u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', - u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', - u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', - u'automated', u'participate', u'ongoing' - ], - [ - u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', - u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', - u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', - u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', - u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', - u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', - u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', - u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', - u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', - u'attach' - ], - [ - u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', - u'asrc', u'engineering', u'technology', u'contracts' - ], - [ - u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', - u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', - u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', - u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', - u'technology', u'modifies', u'technical', 
u'complex', u'reviews', u'draft', u'conformity', u'completeness', - u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', - u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', - u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', - u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', - u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', - u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', - u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', - u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', - u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews' - ], - [ - u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', - u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', - u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', - u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', - u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', - u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', - u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', - u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', - u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', - u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', - u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', - u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie' - ], - [ - u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', - u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', - u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', - u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', - u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', - u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures' - ], - [ - u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', - u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', - u'multiple', u'engineering', u'techexpousa', u'reviews' - ], - [ - u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', - u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', - u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', 
u'aligned', - u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', - u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', - u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', - u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', - u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', - u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', - u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', - u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', - u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration' - ], - [ - u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', - u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', - u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', - u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', - u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', - u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', - u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', - u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', - u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', - u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', - u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', - u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', - u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', - u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', - u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', - u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', - u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', - u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance' - ], - [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], - ['bank', 'river', 'shore', 'water'], - ['river', 'water', 'flow', 'fast', 'tree'], - ['bank', 'water', 'fall', 'flow'], - ['bank', 'bank', 'water', 'rain', 'river'], - ['river', 'water', 'mud', 'tree'], - ['money', 'transaction', 'bank', 'finance'], - ['bank', 'borrow', 'money'], - ['bank', 'finance'], - ['finance', 'money', 'sell', 'bank'], - ['borrow', 'sell'], - ['bank', 'loan', 'sell'] -] -dictionary_ldaseq = Dictionary(texts_ldaseq) -corpus_ldaseq = [dictionary_ldaseq.doc2bow(text) for text in texts_ldaseq] - -w2v_texts = [ - ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 
'continuous', 'change'], - ['geometry', 'is', 'the', 'study', 'of', 'shape'], - ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'], - ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'], - ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', - 'the', 'areas', 'under', 'and', 'between', 'curves'], - ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', - 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'], - ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'], - ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', - 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'], - ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', - 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', - 'transformed', 'modern', 'day', 'society'], -] - -d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] - -dict_texts = [' '.join(text) for text in common_texts] - -phrases_sentences = common_texts + [ - ['graph', 'minors', 'survey', 'human', 'interface'], -] - -connector_words = ["of", "the", "was", "are"] -phrases_w_connector_words = [ - [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'], - [u'the', u'mayor', u'of', u'new', u'orleans', u'was', u'there'], - [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'], - [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed'], -] - - -class TestLdaWrapper(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - self.model = LdaTransformer( - id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0) - ) - self.model.fit(corpus) - - def testTransform(self): - texts_new = ['graph', 'eulerian'] - bow = self.model.id2word.doc2bow(texts_new) - matrix = self.model.transform(bow) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.num_topics) - texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']] - bow = [] - for i in texts_new: - bow.append(self.model.id2word.doc2bow(i)) - matrix = self.model.transform(bow) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.num_topics) - - def testPartialFit(self): - for i in range(10): - self.model.partial_fit(X=corpus) # fit against the model again - doc = list(corpus)[0] # transform only the first document - transformed = self.model.transform(doc) - expected = numpy.array([0.13, 0.87]) - passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1) - self.assertTrue(passed) - - def testConsistencyWithGensimModel(self): - # training an LdaTransformer with `num_topics`=10 - self.model = LdaTransformer( - id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0) - ) - self.model.fit(corpus) - - # training a Gensim LdaModel with the same params - gensim_ldamodel = models.LdaModel( - corpus=corpus, id2word=dictionary, num_topics=10, passes=100, - minimum_probability=0, random_state=numpy.random.seed(0) - ) - - texts_new = ['graph', 'eulerian'] - bow = self.model.id2word.doc2bow(texts_new) - 
matrix_transformer_api = self.model.transform(bow) - matrix_gensim_model = gensim_ldamodel[bow] - # convert into dense representation to be able to compare with transformer output - matrix_gensim_model_dense = matutils.sparse2full(matrix_gensim_model, 10) - passed = numpy.allclose(matrix_transformer_api, matrix_gensim_model_dense, atol=1e-1) - self.assertTrue(passed) - - def testCSRMatrixConversion(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]]) - sarr = sparse.csr_matrix(arr) - newmodel = LdaTransformer(num_topics=2, passes=100) - newmodel.fit(sarr) - bow = [(0, 1), (1, 2), (2, 0)] - transformed_vec = newmodel.transform(bow) - expected_vec = numpy.array([0.12843782, 0.87156218]) - passed = numpy.allclose(transformed_vec, expected_vec, atol=1e-1) - self.assertTrue(passed) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - id2word = Dictionary([x.split() for x in data.data]) - corpus = [id2word.doc2bow(i.split()) for i in data.data] - numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline([('features', model,), ('classifier', clf)]) - text_lda.fit(corpus, data.target) - score = text_lda.score(corpus, data.target) - self.assertGreaterEqual(score, 0.40) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(num_topics=3) - model_params = self.model.get_params() - self.assertEqual(model_params["num_topics"], 3) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3) - - # updating multiple params - param_dict = {"eval_every": 20, "decay": 0.7} - self.model.set_params(**param_dict) - model_params = self.model.get_params() - for key in param_dict.keys(): - self.assertEqual(model_params[key], param_dict[key]) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'eval_every'), 20) - self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.7) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - texts_new = ['graph', 'eulerian'] - loaded_bow = model_load.id2word.doc2bow(texts_new) - loaded_matrix = model_load.transform(loaded_bow) - - # sanity check for transformation operation - self.assertEqual(loaded_matrix.shape[0], 1) - self.assertEqual(loaded_matrix.shape[1], model_load.num_topics) - - # comparing the original and loaded models - original_bow = self.model.id2word.doc2bow(texts_new) - original_matrix = self.model.transform(original_bow) - passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - lda_wrapper = LdaTransformer( - id2word=dictionary, num_topics=2, passes=100, - minimum_probability=0, random_state=numpy.random.seed(0) - ) - texts_new = ['graph', 'eulerian'] - bow = lda_wrapper.id2word.doc2bow(texts_new) - 
self.assertRaises(NotFittedError, lda_wrapper.transform, bow) - - -class TestLsiWrapper(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - self.model = LsiTransformer(id2word=dictionary, num_topics=2) - self.model.fit(corpus) - - def testTransform(self): - texts_new = ['graph', 'eulerian'] - bow = self.model.id2word.doc2bow(texts_new) - matrix = self.model.transform(bow) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.num_topics) - texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']] - bow = [] - for i in texts_new: - bow.append(self.model.id2word.doc2bow(i)) - matrix = self.model.transform(bow) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.num_topics) - - def testPartialFit(self): - for i in range(10): - self.model.partial_fit(X=corpus) # fit against the model again - doc = list(corpus)[0] # transform only the first document - transformed = self.model.transform(doc) - expected = numpy.array([1.39, 0.0]) - passed = numpy.allclose(transformed[0], expected, atol=1) - self.assertTrue(passed) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - model = LsiTransformer(num_topics=2) - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - id2word = Dictionary([x.split() for x in data.data]) - corpus = [id2word.doc2bow(i.split()) for i in data.data] - numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lsi = Pipeline([('features', model,), ('classifier', clf)]) - text_lsi.fit(corpus, data.target) - score = text_lsi.score(corpus, data.target) - self.assertGreater(score, 0.50) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(num_topics=3) - model_params = self.model.get_params() - self.assertEqual(model_params["num_topics"], 3) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3) - - # updating multiple params - param_dict = {"chunksize": 10000, "decay": 0.9} - self.model.set_params(**param_dict) - model_params = self.model.get_params() - for key in param_dict.keys(): - self.assertEqual(model_params[key], param_dict[key]) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'chunksize'), 10000) - self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.9) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - texts_new = ['graph', 'eulerian'] - loaded_bow = model_load.id2word.doc2bow(texts_new) - loaded_matrix = model_load.transform(loaded_bow) - - # sanity check for transformation operation - self.assertEqual(loaded_matrix.shape[0], 1) - self.assertEqual(loaded_matrix.shape[1], model_load.num_topics) - - # comparing the original and loaded models - original_bow = self.model.id2word.doc2bow(texts_new) - original_matrix = self.model.transform(original_bow) - passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - lsi_wrapper = 
LsiTransformer(id2word=dictionary, num_topics=2) - texts_new = ['graph', 'eulerian'] - bow = lsi_wrapper.id2word.doc2bow(texts_new) - self.assertRaises(NotFittedError, lsi_wrapper.transform, bow) - - -class TestLdaSeqWrapper(unittest.TestCase): - def setUp(self): - self.model = LdaSeqTransformer( - id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim', - passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 - ) - self.model.fit(corpus_ldaseq) - - def testTransform(self): - # transforming two documents - docs = [list(corpus_ldaseq)[0], list(corpus_ldaseq)[1]] - transformed_vecs = self.model.transform(docs) - self.assertEqual(transformed_vecs.shape[0], 2) - self.assertEqual(transformed_vecs.shape[1], self.model.num_topics) - - # transforming one document - doc = list(corpus_ldaseq)[0] - transformed_vecs = self.model.transform(doc) - self.assertEqual(transformed_vecs.shape[0], 1) - self.assertEqual(transformed_vecs.shape[1], self.model.num_topics) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - test_data = data.data[0:2] - test_target = data.target[0:2] - id2word = Dictionary([x.split() for x in test_data]) - corpus = [id2word.doc2bow(i.split()) for i in test_data] - model = LdaSeqTransformer( - id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim', - passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 - ) - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_ldaseq = Pipeline([('features', model,), ('classifier', clf)]) - text_ldaseq.fit(corpus, test_target) - score = text_ldaseq.score(corpus, test_target) - self.assertGreater(score, 0.50) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(num_topics=3) - model_params = self.model.get_params() - self.assertEqual(model_params["num_topics"], 3) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus_ldaseq) - self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = list(corpus_ldaseq)[0] - loaded_transformed_vecs = model_load.transform(doc) - - # sanity check for transformation operation - self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(doc) - passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - ldaseq_wrapper = LdaSeqTransformer( - num_topics=2, - passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 - ) - doc = list(corpus_ldaseq)[0] - self.assertRaises(NotFittedError, ldaseq_wrapper.transform, doc) - - -class TestRpWrapper(unittest.TestCase): - def setUp(self): - numpy.random.seed(13) - self.model = RpTransformer(num_topics=2) - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - self.model.fit(self.corpus) - - def testTransform(self): - # tranform two documents - docs = [list(self.corpus)[0], 
list(self.corpus)[1]] - matrix = self.model.transform(docs) - self.assertEqual(matrix.shape[0], 2) - self.assertEqual(matrix.shape[1], self.model.num_topics) - - # tranform one document - doc = list(self.corpus)[0] - matrix = self.model.transform(doc) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.num_topics) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - model = RpTransformer(num_topics=2) - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - id2word = Dictionary([x.split() for x in data.data]) - corpus = [id2word.doc2bow(i.split()) for i in data.data] - numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_rp = Pipeline([('features', model,), ('classifier', clf)]) - text_rp.fit(corpus, data.target) - score = text_rp.score(corpus, data.target) - self.assertGreater(score, 0.40) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(num_topics=3) - model_params = self.model.get_params() - self.assertEqual(model_params["num_topics"], 3) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = list(self.corpus)[0] - loaded_transformed_vecs = model_load.transform(doc) - - # sanity check for transformation operation - self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(doc) - passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - rpmodel_wrapper = RpTransformer(num_topics=2) - doc = list(self.corpus)[0] - self.assertRaises(NotFittedError, rpmodel_wrapper.transform, doc) - - -class TestWord2VecWrapper(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) - self.model.fit(texts) - - def testTransform(self): - # tranform multiple words - words = [] - words = words + texts[0] - matrix = self.model.transform(words) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - # tranform one word - word = texts[0][0] - matrix = self.model.transform(word) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - def testConsistencyWithGensimModel(self): - # training a W2VTransformer - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) - self.model.fit(texts) - - # training a Gensim Word2Vec model with the same params - gensim_w2vmodel = models.Word2Vec(texts, vector_size=10, min_count=0, seed=42) - - word = texts[0][0] - vec_transformer_api = self.model.transform(word) # vector returned by W2VTransformer - vec_gensim_model = gensim_w2vmodel.wv[word] # vector returned by Word2Vec - passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1) - 
self.assertTrue(passed) - - def testPipeline(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(vector_size=10, min_count=1) - model.fit(w2v_texts) - - class_dict = {'mathematics': 1, 'physics': 0} - train_data = [ - ('calculus', 'mathematics'), ('mathematical', 'mathematics'), - ('geometry', 'mathematics'), ('operations', 'mathematics'), - ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'), - ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics') - ] - train_input = [x[0] for x in train_data] - train_target = [class_dict[x[1]] for x in train_data] - - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - clf.fit(model.transform(train_input), train_target) - text_w2v = Pipeline([('features', model,), ('classifier', clf)]) - score = text_w2v.score(train_input, train_target) - self.assertGreater(score, 0.40) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(negative=20) - model_params = self.model.get_params() - self.assertEqual(model_params["negative"], 20) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(texts) - self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - word = texts[0][0] - loaded_transformed_vecs = model_load.transform(word) - - # sanity check for transformation operation - self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(word) - passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) - word = texts[0][0] - self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) - - -class TestAuthorTopicWrapper(unittest.TestCase): - def setUp(self): - self.model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=2, passes=100) - self.model.fit(corpus) - - def testTransform(self): - # transforming multiple authors - author_list = ['jill', 'jack'] - author_topics = self.model.transform(author_list) - self.assertEqual(author_topics.shape[0], 2) - self.assertEqual(author_topics.shape[1], self.model.num_topics) - - # transforming one author - jill_topics = self.model.transform('jill') - self.assertEqual(jill_topics.shape[0], 1) - self.assertEqual(jill_topics.shape[1], self.model.num_topics) - - def testPartialFit(self): - self.model.partial_fit(corpus_new, author2doc=author2doc_new) - - # Did we learn something about Sally? 
- output_topics = self.model.transform('sally') - sally_topics = output_topics[0] # getting the topics corresponding to 'sally' (from the list of lists) - self.assertTrue(all(sally_topics > 0)) - - def testPipeline(self): - # train the AuthorTopic model first - model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100) - model.fit(corpus) - - # create and train clustering model - clstr = cluster.MiniBatchKMeans(n_clusters=2) - authors_full = ['john', 'jane', 'jack', 'jill'] - clstr.fit(model.transform(authors_full)) - - # stack together the two models in a pipeline - text_atm = Pipeline([('features', model,), ('cluster', clstr)]) - author_list = ['jane', 'jack', 'jill'] - ret_val = text_atm.predict(author_list) - self.assertEqual(len(ret_val), len(author_list)) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(num_topics=3) - model_params = self.model.get_params() - self.assertEqual(model_params["num_topics"], 3) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3) - - # updating multiple params - param_dict = {"passes": 5, "iterations": 10} - self.model.set_params(**param_dict) - model_params = self.model.get_params() - for key in param_dict.keys(): - self.assertEqual(model_params[key], param_dict[key]) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(corpus) - self.assertEqual(getattr(self.model.gensim_model, 'passes'), 5) - self.assertEqual(getattr(self.model.gensim_model, 'iterations'), 10) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - author_list = ['jill'] - loaded_author_topics = model_load.transform(author_list) - - # sanity check for transformation operation - self.assertEqual(loaded_author_topics.shape[0], 1) - self.assertEqual(loaded_author_topics.shape[1], self.model.num_topics) - - # comparing the original and loaded models - original_author_topics = self.model.transform(author_list) - passed = numpy.allclose(loaded_author_topics, original_author_topics, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - atmodel_wrapper = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100) - author_list = ['jill', 'jack'] - self.assertRaises(NotFittedError, atmodel_wrapper.transform, author_list) - - -class TestD2VTransformer(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = D2VTransformer(min_count=1) - self.model.fit(d2v_sentences) - - def testTransform(self): - # tranform multiple documents - docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] - matrix = self.model.transform(docs) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - # tranform one document - doc = w2v_texts[0] - matrix = self.model.transform(doc) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - def testFitTransform(self): - model = D2VTransformer(min_count=1) - - # fit and transform multiple documents - docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] - matrix = model.fit_transform(docs) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], model.vector_size) - - # fit and transform one document - doc = w2v_texts[0] - matrix = model.fit_transform(doc) - self.assertEqual(matrix.shape[0], 1) 
- self.assertEqual(matrix.shape[1], model.vector_size) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(negative=20) - model_params = self.model.get_params() - self.assertEqual(model_params["negative"], 20) - - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(d2v_sentences) - self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20) - - def testPipeline(self): - numpy.random.seed(0) # set fixed seed to get similar values everytime - model = D2VTransformer(min_count=1) - model.fit(d2v_sentences) - - class_dict = {'mathematics': 1, 'physics': 0} - train_data = [ - (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'), - (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics') - ] - train_input = [x[0] for x in train_data] - train_target = [class_dict[x[1]] for x in train_data] - - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - clf.fit(model.transform(train_input), train_target) - text_w2v = Pipeline([('features', model,), ('classifier', clf)]) - score = text_w2v.score(train_input, train_target) - self.assertGreater(score, 0.40) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = w2v_texts[0] - loaded_transformed_vecs = model_load.transform(doc) - - # sanity check for transformation operation - self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(doc) - passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1) - self.assertTrue(passed) - - def testConsistencyWithGensimModel(self): - # training a D2VTransformer - self.model = D2VTransformer(min_count=1) - self.model.fit(d2v_sentences) - - # training a Gensim Doc2Vec model with the same params - gensim_d2vmodel = models.Doc2Vec(d2v_sentences, min_count=1) - - doc = w2v_texts[0] - vec_transformer_api = self.model.transform(doc) # vector returned by D2VTransformer - vec_gensim_model = gensim_d2vmodel[doc] # vector returned by Doc2Vec - passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - d2vmodel_wrapper = D2VTransformer(min_count=1) - self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1) - - -class TestText2BowTransformer(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = Text2BowTransformer() - self.model.fit(dict_texts) - - def testTransform(self): - # tranform one document - doc = ['computer system interface time computer system'] - bow_vec = self.model.transform(doc)[0] - expected_values = [1, 1, 2, 2] # comparing only the word-counts - values = [x[1] for x in bow_vec] - self.assertEqual(sorted(expected_values), sorted(values)) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(prune_at=1000000) - model_params = self.model.get_params() - self.assertEqual(model_params["prune_at"], 1000000) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - 
text2bow_model = Text2BowTransformer() - lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) - numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)]) - text_lda.fit(data.data, data.target) - score = text_lda.score(data.data, data.target) - self.assertGreater(score, 0.40) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = dict_texts[0] - loaded_transformed_vecs = model_load.transform(doc) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(doc) - self.assertEqual(original_transformed_vecs, loaded_transformed_vecs) - - def testModelNotFitted(self): - text2bow_wrapper = Text2BowTransformer() - self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0]) - - -class TestTfIdfTransformer(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = TfIdfTransformer(normalize=True) - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - self.model.fit(self.corpus) - - def testTransform(self): - # tranform one document - doc = corpus[0] - transformed_doc = self.model.transform(doc) - expected_doc = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]] - self.assertTrue(numpy.allclose(transformed_doc, expected_doc)) - - # tranform multiple documents - docs = [corpus[0], corpus[1]] - transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), - (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(smartirs='nnn') - model_params = self.model.get_params() - self.assertEqual(model_params["smartirs"], 'nnn') - - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - id2word = Dictionary([x.split() for x in data.data]) - corpus = [id2word.doc2bow(i.split()) for i in data.data] - tfidf_model = TfIdfTransformer() - tfidf_model.fit(corpus) - lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) - numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)]) - text_tfidf.fit(corpus, data.target) - score = text_tfidf.score(corpus, data.target) - self.assertGreater(score, 0.40) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = corpus[0] - 
loaded_transformed_doc = model_load.transform(doc) - - # comparing the original and loaded models - original_transformed_doc = self.model.transform(doc) - self.assertEqual(original_transformed_doc, loaded_transformed_doc) - - def testModelNotFitted(self): - tfidf_wrapper = TfIdfTransformer() - self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) - - -class TestHdpTransformer(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = HdpTransformer(id2word=dictionary, random_state=42) - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - self.model.fit(self.corpus) - - @unittest.skipIf(AZURE, 'see ') - def testTransform(self): - # tranform one document - doc = self.corpus[0] - transformed_doc = self.model.transform(doc) - expected_doc = [ - [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, - 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148] - ] - self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2)) - - # tranform multiple documents - docs = [self.corpus[0], self.corpus[1]] - transformed_docs = self.model.transform(docs) - expected_docs = [ - [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, - 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148], - [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2)) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2)) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(var_converge=0.05) - model_params = self.model.get_params() - self.assertEqual(model_params["var_converge"], 0.05) - - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'm_var_converge'), 0.05) - - @unittest.skip('see https://github.com/RaRe-Technologies/gensim/issues/3016') - def testPipeline(self): - with open(datapath('mini_newsgroup'), 'rb') as f: - compressed_content = f.read() - uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') - cache = pickle.loads(uncompressed_content) - data = cache - id2word = Dictionary([x.split() for x in data.data]) - corpus = [id2word.doc2bow(i.split()) for i in data.data] - model = HdpTransformer(id2word=id2word) - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - text_lda = Pipeline([('features', model,), ('classifier', clf)]) - text_lda.fit(corpus, data.target) - score = text_lda.score(corpus, data.target) - self.assertGreater(score, 0.40) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = corpus[0] - loaded_transformed_doc = model_load.transform(doc) - - # comparing the original and loaded models - original_transformed_doc = self.model.transform(doc) - self.assertTrue(numpy.allclose(original_transformed_doc, loaded_transformed_doc)) - - def testModelNotFitted(self): - hdp_wrapper = HdpTransformer(id2word=dictionary) - self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0]) - - -class TestPhrasesTransformer(unittest.TestCase): - def setUp(self): - numpy.random.seed(0) - self.model = PhrasesTransformer(min_count=1, threshold=1) - self.model.fit(phrases_sentences) - - def testTransform(self): - # tranform one document - doc = phrases_sentences[-1] - phrase_tokens = self.model.transform(doc)[0] - 
expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] - self.assertEqual(phrase_tokens, expected_phrase_tokens) - - def testPartialFit(self): - new_sentences = [ - ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'], - ['world', 'peace', 'people'], - ['world', 'peace', 'humans'], - ] - self.model.partial_fit(X=new_sentences) # train model with new sentences - - doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace'] - phrase_tokens = self.model.transform(doc)[0] - expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface', u'world_peace'] - self.assertEqual(phrase_tokens, expected_phrase_tokens) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(progress_per=5000) - model_params = self.model.get_params() - self.assertEqual(model_params["progress_per"], 5000) - - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(phrases_sentences) - self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = phrases_sentences[-1] - loaded_phrase_tokens = model_load.transform(doc) - - # comparing the original and loaded models - original_phrase_tokens = self.model.transform(doc) - self.assertEqual(original_phrase_tokens, loaded_phrase_tokens) - - def testModelNotFitted(self): - phrases_transformer = PhrasesTransformer() - self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) - - -class TestPhrasesTransformerCommonTerms(unittest.TestCase): - def setUp(self): - self.model = PhrasesTransformer(min_count=1, threshold=1, connector_words=connector_words) - self.expected_transformations = [ - [u'the', u'mayor_of_new', u'york', u'was', u'there'], - [u'the', u'mayor_of_new', u'orleans', u'was', u'there'], - [u'the', u'bank_of_america', u'offices', u'are', u'open'], - [u'the', u'bank_of_america', u'offices', u'are', u'closed'] - ] - - def testFitAndTransform(self): - self.model.fit(phrases_w_connector_words) - - transformed = self.model.transform(phrases_w_connector_words) - self.assertEqual(transformed, self.expected_transformations) - - def testFitTransform(self): - transformed = self.model.fit_transform(phrases_w_connector_words) - self.assertEqual(transformed, self.expected_transformations) - - def testPartialFit(self): - # fit half of the sentences - self.model.fit(phrases_w_connector_words[:2]) - - expected_transformations_0 = [ - [u'the', u'mayor_of_new', u'york', u'was', u'there'], - [u'the', u'mayor_of_new', u'orleans', u'was', u'there'], - [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'], - [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed'] - ] - # transform all sentences, second half should be same as original - transformed_0 = self.model.transform(phrases_w_connector_words) - self.assertEqual(transformed_0, expected_transformations_0) - - # fit remaining sentences, result should be the same as in the other tests - self.model.partial_fit(phrases_w_connector_words[2:]) - transformed_1 = self.model.fit_transform(phrases_w_connector_words) - self.assertEqual(transformed_1, self.expected_transformations) - - new_phrases = [[u'offices', u'are', u'open'], [u'offices', u'are', u'closed']] - self.model.partial_fit(new_phrases) - expected_transformations_2 = [ - [u'the', u'mayor_of_new', u'york', u'was', u'there'], - [u'the', u'mayor_of_new', u'orleans', 
u'was', u'there'], - [u'the', u'bank_of_america', u'offices_are_open'], - [u'the', u'bank_of_america', u'offices_are_closed'] - ] - transformed_2 = self.model.transform(phrases_w_connector_words) - self.assertEqual(transformed_2, expected_transformations_2) - - -# For testing pluggable scoring in Phrases – must remain pickleable. -def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - return 1 - - -class TestPhrasesTransformerCustomScorer(unittest.TestCase): - - def setUp(self): - numpy.random.seed(0) - - self.model = PhrasesTransformer(min_count=1, threshold=.9, scoring=dumb_scorer) - self.model.fit(phrases_sentences) - - def testTransform(self): - # tranform one document - doc = phrases_sentences[-1] - phrase_tokens = self.model.transform(doc)[0] - expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface'] - self.assertEqual(phrase_tokens, expected_phrase_tokens) - - def testPartialFit(self): - new_sentences = [ - ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'], - ['world', 'peace', 'people'], - ['world', 'peace', 'humans'] - ] - self.model.partial_fit(X=new_sentences) # train model with new sentences - - doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace'] - phrase_tokens = self.model.transform(doc)[0] - expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface', u'world_peace'] - self.assertEqual(phrase_tokens, expected_phrase_tokens) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(progress_per=5000) - model_params = self.model.get_params() - self.assertEqual(model_params["progress_per"], 5000) - - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(phrases_sentences) - self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - doc = phrases_sentences[-1] - loaded_phrase_tokens = model_load.transform(doc) - - # comparing the original and loaded models - original_phrase_tokens = self.model.transform(doc) - self.assertEqual(original_phrase_tokens, loaded_phrase_tokens) - - def testModelNotFitted(self): - phrases_transformer = PhrasesTransformer() - self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) - - -class TestFTTransformer(unittest.TestCase): - def setUp(self): - self.model = FTTransformer(vector_size=10, min_count=0, seed=42, bucket=5000) - self.model.fit(texts) - - def testTransform(self): - # tranform multiple words - words = [] - words = words + texts[0] - matrix = self.model.transform(words) - self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - # tranform one word - word = texts[0][0] - matrix = self.model.transform(word) - self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.vector_size) - - # verify oov-word vector retrieval - invocab_vec = self.model.transform("computer") # invocab word - self.assertEqual(invocab_vec.shape[0], 1) - self.assertEqual(invocab_vec.shape[1], self.model.vector_size) - - oov_vec = self.model.transform('compute') # oov word - self.assertEqual(oov_vec.shape[0], 1) - self.assertEqual(oov_vec.shape[1], self.model.vector_size) - - def testConsistencyWithGensimModel(self): - # training a FTTransformer - self.model = FTTransformer(vector_size=10, min_count=0, seed=42, workers=1, bucket=5000) - 
self.model.fit(texts) - - # training a Gensim FastText model with the same params - gensim_ftmodel = models.FastText(texts, vector_size=10, min_count=0, seed=42, workers=1, bucket=5000) - - # vectors returned by FTTransformer - vecs_transformer_api = self.model.transform( - [text for text_list in texts for text in text_list]) - # vectors returned by FastText - vecs_gensim_model = [gensim_ftmodel.wv[text] for text_list in texts for text in text_list] - passed = numpy.allclose(vecs_transformer_api, vecs_gensim_model) - self.assertTrue(passed) - - # test for out of vocab words - oov_words = ["compute", "serve", "sys", "net"] - vecs_transformer_api = self.model.transform(oov_words) # vector returned by FTTransformer - vecs_gensim_model = [gensim_ftmodel.wv[word] for word in oov_words] # vector returned by FastText - passed = numpy.allclose(vecs_transformer_api, vecs_gensim_model) - self.assertTrue(passed) - - def testPipeline(self): - model = FTTransformer(vector_size=10, min_count=1, bucket=5000) - model.fit(w2v_texts) - - class_dict = {'mathematics': 1, 'physics': 0} - train_data = [ - ('calculus', 'mathematics'), ('mathematical', 'mathematics'), - ('geometry', 'mathematics'), ('operations', 'mathematics'), - ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'), - ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics') - ] - train_input = [x[0] for x in train_data] - train_target = [class_dict[x[1]] for x in train_data] - - clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - clf.fit(model.transform(train_input), train_target) - text_ft = Pipeline([('features', model,), ('classifier', clf)]) - score = text_ft.score(train_input, train_target) - self.assertGreater(score, 0.40) - - def testSetGetParams(self): - # updating only one param - self.model.set_params(negative=20) - model_params = self.model.get_params() - self.assertEqual(model_params["negative"], 20) - # verify that the attributes values are also changed for `gensim_model` after fitting - self.model.fit(texts) - self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20) - - def testPersistence(self): - model_dump = pickle.dumps(self.model) - model_load = pickle.loads(model_dump) - - # pass all words in one list - words = [word for text_list in texts for word in text_list] - loaded_transformed_vecs = model_load.transform(words) - - # sanity check for transformation operation - self.assertEqual(loaded_transformed_vecs.shape[0], len(words)) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) - - # comparing the original and loaded models - original_transformed_vecs = self.model.transform(words) - passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1) - self.assertTrue(passed) - - def testModelNotFitted(self): - ftmodel_wrapper = FTTransformer(vector_size=10, min_count=0, seed=42, bucket=5000) - word = texts[0][0] - self.assertRaises(NotFittedError, ftmodel_wrapper.transform, word) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_varembed_wrapper.py b/gensim/test/test_varembed_wrapper.py deleted file mode 100644 index 9d0a16d6e3..0000000000 --- a/gensim/test/test_varembed_wrapper.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2017 Anmol Gulati -# Copyright (C) 2017 Radim Rehurek -# -# Licensed under the GNU LGPL v2.1 - 
http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for VarEmbed wrapper. -""" - -import logging -import os -import sys - -import numpy as np - -import unittest - -from gensim.models.wrappers import varembed -from gensim.test.utils import datapath - -try: - import morfessor # noqa: F401 -except ImportError: - raise unittest.SkipTest("Test requires Morfessor to be installed, which is not available") - - -varembed_model_vector_file = datapath('varembed_vectors.pkl') -varembed_model_morfessor_file = datapath('varembed_morfessor.bin') - -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) - - -@unittest.skipIf(AZURE, 'see ') -class TestVarembed(unittest.TestCase): - def testLoadVarembedFormat(self): - """Test storing/loading the entire model.""" - model = varembed.VarEmbed.load_varembed_format(vectors=varembed_model_vector_file) - self.model_sanity(model) - - def testSimilarity(self): - """Test n_similarity for vocab words""" - model = varembed.VarEmbed.load_varembed_format(vectors=varembed_model_vector_file) - self.assertTrue(model.n_similarity(['result'], ['targets']) == model.similarity('result', 'targets')) - - def model_sanity(self, model): - """Check vocabulary and vector size""" - self.assertEqual(model.vectors.shape, (model.vocab_size, model.vector_size)) - self.assertTrue(model.vectors.shape[0] == len(model)) - - @unittest.skipIf(sys.version_info < (2, 7), 'Supported only on Python 2.7 and above') - def testAddMorphemesToEmbeddings(self): - """Test add morphemes to Embeddings - Test only in Python 2.7 and above. Add Morphemes is not supported in earlier versions. - """ - model = varembed.VarEmbed.load_varembed_format(vectors=varembed_model_vector_file) - model_with_morphemes = varembed.VarEmbed.load_varembed_format( - vectors=varembed_model_vector_file, morfessor_model=varembed_model_morfessor_file) - self.model_sanity(model_with_morphemes) - # Check syn0 is different for both models. - self.assertFalse(np.allclose(model.vectors, model_with_morphemes.vectors)) - - def testLookup(self): - """Test lookup of vector for a particular word and list""" - model = varembed.VarEmbed.load_varembed_format(vectors=varembed_model_vector_file) - self.assertTrue(np.allclose(model['language'], model[['language']])) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py deleted file mode 100644 index 1234a86659..0000000000 --- a/gensim/test/test_wordrank_wrapper.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). 
-""" - - -import logging -import unittest -import os - -import numpy - -from gensim.models.wrappers import wordrank -from gensim.test.utils import datapath, get_tmpfile - - -class TestWordrank(unittest.TestCase): - def setUp(self): - wr_home = os.environ.get('WR_HOME', None) - self.wr_path = wr_home if wr_home else None - self.corpus_file = datapath('lee.cor') - self.out_name = 'testmodel' - self.wr_file = datapath('test_glove.txt') - if not self.wr_path: - return - self.test_model = wordrank.Wordrank.train( - self.wr_path, self.corpus_file, self.out_name, iter=6, - dump_period=5, period=5, np=4, cleanup_files=True - ) - - def testLoadWordrankFormat(self): - """Test model successfully loaded from Wordrank format file""" - model = wordrank.Wordrank.load_wordrank_model(self.wr_file) - vocab_size, dim = 76, 50 - self.assertEqual(model.vectors.shape, (vocab_size, dim)) - self.assertEqual(len(model), vocab_size) - - def testEnsemble(self): - """Test ensemble of two embeddings""" - if not self.wr_path: - return - new_emb = self.test_model.ensemble_embedding(self.wr_file, self.wr_file) - self.assertEqual(new_emb.shape, (76, 50)) - - def testPersistence(self): - """Test storing/loading the entire model""" - if not self.wr_path: - return - tmpf = get_tmpfile('gensim_wordrank.test') - self.test_model.save(tmpf) - loaded = wordrank.Wordrank.load(tmpf) - self.models_equal(self.test_model, loaded) - - def testSimilarity(self): - """Test n_similarity for vocab words""" - if not self.wr_path: - return - self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) - self.assertEqual(self.test_model.similarity('the', 'and'), self.test_model.similarity('the', 'and')) - - def testLookup(self): - if not self.wr_path: - return - self.assertTrue(numpy.allclose(self.test_model['night'], self.test_model[['night']])) - - def models_equal(self, model, model2): - self.assertEqual(len(model), len(model2)) - self.assertEqual(set(model.index_to_key), set(model2.index_to_key)) - self.assertTrue(numpy.allclose(model.syn0, model2.syn0)) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/setup.py b/setup.py index 7fe8ae7815..987ea197e2 100644 --- a/setup.py +++ b/setup.py @@ -275,7 +275,6 @@ def run(self): 'testfixtures', 'Morfessor==2.0.2a4', 'python-Levenshtein >= 0.10.2', - 'scikit-learn', ] # Add additional requirements for testing on Linux that are skipped on Windows. @@ -307,7 +306,6 @@ def run(self): 'memory_profiler', 'annoy', 'Pyro4', - 'scikit-learn', 'nltk', 'testfixtures', 'statsmodels',