{"id":1617,"date":"2023-08-17T03:04:09","date_gmt":"2023-08-17T03:04:09","guid":{"rendered":"https:\/\/www.gptmain.news\/?p=1617"},"modified":"2023-08-17T03:04:09","modified_gmt":"2023-08-17T03:04:09","slug":"%d0%ba%d0%bb%d0%b0%d1%81%d1%82%d0%b5%d1%80%d0%b8%d0%b7%d0%b0%d1%86%d0%b8%d1%8f-%d0%b4%d0%be%d0%ba%d1%83%d0%bc%d0%b5%d0%bd%d1%82%d0%be%d0%b2-%d0%b8%d0%b8-%d0%bb%d0%b5%d1%82%d0%be-gptmain-news","status":"publish","type":"post","link":"https:\/\/gptmain.news\/?p=1617","title":{"rendered":"\u041a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 |  \u0418\u0418 \u041b\u0435\u0442\u043e\n | GPTMain News"},"content":{"rendered":"<div id=\"\">\n<p>\u041a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u2014 \u044d\u0442\u043e \u0437\u0430\u0434\u0430\u0447\u0430 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u043f\u043e \u0440\u0430\u0437\u043d\u044b\u043c \u0433\u0440\u0443\u043f\u043f\u0430\u043c \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u0438\u0445 \u0442\u0435\u043a\u0441\u0442\u043e\u0432\u043e\u0433\u043e \u0438 \u0441\u0435\u043c\u0430\u043d\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0430.  \u042d\u0442\u043e \u043d\u0435\u043a\u043e\u043d\u0442\u0440\u043e\u043b\u0438\u0440\u0443\u0435\u043c\u044b\u0439 \u043c\u0435\u0442\u043e\u0434, \u043f\u043e\u0441\u043a\u043e\u043b\u044c\u043a\u0443 \u0443 \u043d\u0430\u0441 \u043d\u0435\u0442 \u043c\u0435\u0442\u043e\u043a \u0434\u043b\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432, \u0438 \u043e\u043d \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u0442\u0441\u044f \u0432 \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u043e\u043d\u043d\u043e\u043c \u043f\u043e\u0438\u0441\u043a\u0435 \u0438 \u043f\u043e\u0438\u0441\u043a\u043e\u0432\u044b\u0445 \u0441\u0438\u0441\u0442\u0435\u043c\u0430\u0445.<\/p>\n<p>\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043d\u0430\u0447\u043d\u0435\u043c&#8230;<\/p>\n<p>\u0427\u0442\u043e\u0431\u044b \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u0446\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u044b \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u0438\u0445 \u0441\u043e\u0434\u0435\u0440\u0436\u0430\u043d\u0438\u044f, \u044f \u0440\u0435\u0448\u0438\u043b \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c \u041a-\u0441\u0440\u0435\u0434\u043d\u0438\u0445.  \u0418\u0437-\u0437\u0430 \u0442\u043e\u0433\u043e, \u0447\u0442\u043e \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b \u043d\u0435 \u043f\u043e\u043c\u0435\u0447\u0435\u043d\u044b, \u044d\u0442\u043e \u044f\u0432\u043d\u043e \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u0430 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u0431\u0435\u0437 \u0443\u0447\u0438\u0442\u0435\u043b\u044f, \u0438 \u043e\u0434\u043d\u0438\u043c \u0438\u0437 \u043b\u0443\u0447\u0448\u0438\u0445 \u0440\u0435\u0448\u0435\u043d\u0438\u0439 \u0434\u043e\u043b\u0436\u043d\u043e \u0431\u044b\u0442\u044c K-Means.  \u041a\u043e\u043d\u0435\u0447\u043d\u043e, \u043c\u044b \u043c\u043e\u0436\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u0440\u0443\u0433\u043e\u0439 \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440 \u0441\u043c\u0435\u0448\u0430\u043d\u043d\u044b\u0435 \u043c\u043e\u0434\u0435\u043b\u0438 \u0413\u0430\u0443\u0441\u0441\u0430, \u0438\u043b\u0438 \u0434\u0430\u0436\u0435 \u043c\u0435\u0442\u043e\u0434\u044b \u0433\u043b\u0443\u0431\u043e\u043a\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f, \u0442\u0430\u043a\u0438\u0435 \u043a\u0430\u043a \u0430\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440\u044b.  \u042f \u0431\u0443\u0434\u0443 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c python \u0441 \u0431\u043b\u043e\u043a\u043d\u043e\u0442\u043e\u043c Jupyter, \u0447\u0442\u043e\u0431\u044b \u043e\u0431\u044a\u0435\u0434\u0438\u043d\u0438\u0442\u044c \u043a\u043e\u0434 \u0438 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u0441 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u0435\u0439.<\/p>\n<p>\u042f \u0440\u0430\u0437\u0440\u0430\u0431\u0430\u0442\u044b\u0432\u0430\u044e \u043a\u043e\u0434 \u0432 \u0441\u0440\u0435\u0434\u0435 Anaconda \u0438 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u044e \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0435 \u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0441\u0442\u0438:<\/p>\n<p>\u041f\u0430\u043d\u0434\u044b \u0434\u043b\u044f \u043f\u0435\u0440\u0435\u0434\u0430\u0447\u0438 \u0434\u0430\u043d\u043d\u044b\u0445<\/p>\n<p>Sklearn \u0434\u043b\u044f \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u0438 \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438<\/p>\n<p>Matplotlib \u0434\u043b\u044f \u043f\u043e\u0441\u0442\u0440\u043e\u0435\u043d\u0438\u044f \u0433\u0440\u0430\u0444\u0438\u043a\u043e\u0432<\/p>\n<p>Ntlk \u0434\u043b\u044f \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c\u043e\u0432 \u0435\u0441\u0442\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0433\u043e \u044f\u0437\u044b\u043a\u0430<\/p>\n<p>BeautifulSoup \u0434\u043b\u044f \u0430\u043d\u0430\u043b\u0438\u0437\u0430 \u0442\u0435\u043a\u0441\u0442\u0430 \u0438\u0437 XML-\u0444\u0430\u0439\u043b\u0430 \u0438 \u0438\u0437\u0431\u0430\u0432\u043b\u0435\u043d\u0438\u044f \u043e\u0442 \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0439<\/p>\n<h2 id=\"parsing-the-data\">\u0410\u043d\u0430\u043b\u0438\u0437 \u0434\u0430\u043d\u043d\u044b\u0445<\/h2>\n<p>\u0424\u0443\u043d\u043a\u0446\u0438\u044f parseXML \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442 xml.etree.ElementTree \u0434\u043b\u044f \u0430\u043d\u0430\u043b\u0438\u0437\u0430 \u0434\u0430\u043d\u043d\u044b\u0445.  \u042f \u0440\u0435\u0448\u0438\u043b \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u0438 \u0442\u043e\u043b\u044c\u043a\u043e \u043d\u0430\u0437\u0432\u0430\u043d\u0438\u0435 \u0438 \u043e\u043f\u0438\u0441\u0430\u043d\u0438\u0435 \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u043e\u0432, \u043d\u0430\u0438\u0431\u043e\u043b\u0435\u0435 \u0440\u0435\u043b\u0435\u0432\u0430\u043d\u0442\u043d\u044b\u0445 \u0441\u0435\u043c\u0430\u0441\u0438\u043e\u043b\u043e\u0433\u0438\u0438.  \u0418\u0437-\u0437\u0430 \u0442\u043e\u0433\u043e, \u0447\u0442\u043e \u043e\u043f\u0438\u0441\u0430\u043d\u0438\u0435 \u043d\u0435 \u044f\u0432\u043b\u044f\u0435\u0442\u0441\u044f raw tex, \u043c\u044b \u0438\u0437\u0432\u043b\u0435\u043a\u0430\u0435\u043c \u0442\u0435\u043a\u0441\u0442 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u043e\u0439 BeautifulSoup, \u043a\u0430\u043a \u044f \u0443\u0436\u0435 \u0443\u043f\u043e\u043c\u0438\u043d\u0430\u043b.  \u0422\u0430\u043a\u0436\u0435 \u043c\u044b \u043e\u0442\u0431\u0440\u0430\u0441\u044b\u0432\u0430\u0435\u043c \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b \u0441 \u043e\u0447\u0435\u043d\u044c \u043c\u0430\u043b\u0435\u043d\u044c\u043a\u0438\u043c \u043e\u043f\u0438\u0441\u0430\u043d\u0438\u0435\u043c, \u043f\u043e\u0442\u043e\u043c\u0443 \u0447\u0442\u043e \u043e\u043d\u0438 \u0432\u043b\u0438\u044f\u044e\u0442 \u043d\u0430 \u043e\u043a\u043e\u043d\u0447\u0430\u0442\u0435\u043b\u044c\u043d\u0443\u044e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044e.  \u041c\u043e\u0436\u043d\u043e \u0441\u0447\u0438\u0442\u0430\u0442\u044c, \u0447\u0442\u043e \u0432\u0441\u0435 \u043e\u043d\u0438 \u043f\u0440\u0438\u043d\u0430\u0434\u043b\u0435\u0436\u0430\u0442 \u0434\u043e\u043f\u043e\u043b\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u043c\u0443 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0443.  \u041a\u043e\u043d\u0435\u0447\u043d\u043e, \u0435\u0441\u0442\u044c \u0441\u043f\u043e\u0441\u043e\u0431\u044b \u0438\u0445 \u0432\u043a\u043b\u044e\u0447\u0438\u0442\u044c, \u043d\u043e \u044f \u043f\u043e\u043a\u0430 \u0438\u043c\u0438 \u043d\u0435 \u043f\u043e\u043b\u044c\u0437\u0443\u044e\u0441\u044c.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> xml<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">etree<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">ElementTree <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">as<\/span><span class=\"token plain\"> ET<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> pandas <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">as<\/span><span class=\"token plain\"> pd<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> nltk<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> sklearn<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">cluster <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> KMeans<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> sklearn<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">externals <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> joblib<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> sklearn<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">feature_extraction<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">text <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> TfidfVectorizer<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> sklearn<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">metrics<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">pairwise <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> cosine_similarity<\/span><\/p><p><span class=\"token plain\">nltk<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">download<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'punkt'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> bs4 <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> BeautifulSoup<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> nltk <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> SnowballStemmer<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> re<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">def<\/span><span class=\"token plain\"> <\/span><span class=\"token function\" style=\"color:rgb(80, 250, 123)\">parseXML<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">xmlfile<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    tree <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> ET<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">parse<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">xmlfile<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    root <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tree<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">getroot<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    titles<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    descriptions<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> item <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> root<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">findall<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'.\/channel\/item'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> child <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> item<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">            <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">if<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">child<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">tag<\/span><span class=\"token operator\">==<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'title'<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">                titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">child<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">            <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">if<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">child<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">tag <\/span><span class=\"token operator\">==<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'description'<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">                soup <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> BeautifulSoup<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">str<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">child<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">encode<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'utf8'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'ignore'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">\"lxml\"<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">                strtext<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">soup<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">replace<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">u'\\xa0'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">u' '<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">replace<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'\\n'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">' '<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">                descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">strtext<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">return<\/span><span class=\"token plain\"> titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">descriptions<\/span><\/p><p><span class=\"token plain\">bef_titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">bef_descriptions <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> parseXML<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'data.source.rss-feeds.xml'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'Count of items before dropping:'<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">len<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">bef_titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">titles<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">descriptions<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> i <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> <\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">range<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">len<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">bef_titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">if<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\"> <\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">len<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">bef_descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">&gt;<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">500<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">bef_titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">bef_descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'Count of items after:'<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">len<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><\/p><\/pre>\n<pre class=\"prism-code language-\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">[nltk_data] Downloading package punkt to<\/span><\/p><p><span class=\"token plain\">[nltk_data]     C:\\Users\\sergi\\AppData\\Roaming\\nltk_data...<\/span><\/p><p><span class=\"token plain\">[nltk_data]   Package punkt is already up-to-date!<\/span><\/p><p><span class=\"token plain\">Count of items before dropping: 1662<\/span><\/p><p><span class=\"token plain\">Count of items after: 1130<\/span><\/p><\/pre>\n<h2 id=\"tokenizing-and-stemming\">\u0422\u043e\u043a\u0435\u043d\u0438\u0437\u0430\u0446\u0438\u044f \u0438 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433<\/h2>\n<p>\u0421\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u043c \u0448\u0430\u0433\u043e\u043c \u044f\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0440\u0430\u0437\u0431\u0438\u0435\u043d\u0438\u0435 \u0442\u0435\u043a\u0441\u0442\u0430 \u043d\u0430 \u0441\u043b\u043e\u0432\u0430, \u0443\u0434\u0430\u043b\u0435\u043d\u0438\u0435 \u0432\u0441\u0435\u0445 \u043c\u043e\u0440\u0444\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438\u0445 \u0430\u0444\u0444\u0438\u043a\u0441\u043e\u0432 \u0438 \u0443\u0434\u0430\u043b\u0435\u043d\u0438\u0435 \u043e\u0431\u0449\u0438\u0445 \u0441\u043b\u043e\u0432, \u0442\u0430\u043a\u0438\u0445 \u043a\u0430\u043a \u0430\u0440\u0442\u0438\u043a\u043b\u0438 \u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0433\u0438.  \u042d\u0442\u043e \u043c\u043e\u0436\u043d\u043e \u0441\u0434\u0435\u043b\u0430\u0442\u044c \u0441 \u043f\u043e\u043c\u043e\u0449\u044c\u044e \u0432\u0441\u0442\u0440\u043e\u0435\u043d\u043d\u044b\u0445 \u0444\u0443\u043d\u043a\u0446\u0438\u0439 ntlk.  \u0412 \u043a\u043e\u043d\u0446\u0435 \u043a\u043e\u043d\u0446\u043e\u0432, \u043c\u044b \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u043c \u0434\u0432\u0430 \u0440\u0430\u0437\u043d\u044b\u0445 \u0441\u043b\u043e\u0432\u0430\u0440\u044f (\u043e\u0434\u0438\u043d \u0442\u043e\u043a\u0435\u043d\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u0438 \u0431\u0430\u0437\u043e\u0432\u044b\u0439 \u0438 \u043e\u0434\u0438\u043d \u0442\u043e\u043b\u044c\u043a\u043e \u0442\u043e\u043a\u0435\u043d\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439), \u0438 \u043c\u044b \u043e\u0431\u044a\u0435\u0434\u0438\u043d\u044f\u0435\u043c \u0438\u0445 \u0432 \u043a\u0430\u0434\u0440 \u0434\u0430\u043d\u043d\u044b\u0445 pandas.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">def<\/span><span class=\"token plain\"> <\/span><span class=\"token function\" style=\"color:rgb(80, 250, 123)\">tokenize_and_stem<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    tokens <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">word <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> sent <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> nltk<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">sent_tokenize<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> word <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> nltk<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">word_tokenize<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">sent<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    filtered_tokens <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> token <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> tokens<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">if<\/span><span class=\"token plain\"> re<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">search<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'[a-zA-Z]'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> token<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">            filtered_tokens<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">token<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    stems <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">stemmer<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">stem<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">t<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> t <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> filtered_tokens<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">return<\/span><span class=\"token plain\"> stems<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">def<\/span><span class=\"token plain\"> <\/span><span class=\"token function\" style=\"color:rgb(80, 250, 123)\">tokenize_only<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    tokens <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">word<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">lower<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> sent <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> nltk<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">sent_tokenize<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">text<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> word <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> nltk<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">word_tokenize<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">sent<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    filtered_tokens <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> token <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> tokens<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">if<\/span><span class=\"token plain\"> re<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">search<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'[a-zA-Z]'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> token<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">            filtered_tokens<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">append<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">token<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">return<\/span><span class=\"token plain\"> filtered_tokens<\/span><\/p><\/pre>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">stemmer <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> SnowballStemmer<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">\"english\"<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">totalvocab_stemmed <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">totalvocab_tokenized <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> i <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    allwords_stemmed <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tokenize_and_stem<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    totalvocab_stemmed<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">extend<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">allwords_stemmed<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    allwords_tokenized <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tokenize_only<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    totalvocab_tokenized<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">extend<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">allwords_tokenized<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">vocab_frame <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> pd<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">DataFrame<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">{<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'words'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> totalvocab_tokenized<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">}<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> index<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">totalvocab_stemmed<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'there are '<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">+<\/span><span class=\"token plain\"> <\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">str<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">vocab_frame<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">shape<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">+<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">' items in vocab_frame'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><\/p><\/pre>\n<pre class=\"prism-code language-\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">there are 481437 items in vocab_frame<\/span><\/p><\/pre>\n<h2 id=\"vectorizing-and-stemming\">\u0412\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u044f \u0438 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433<\/h2>\n<p>\u041f\u0440\u0435\u0436\u0434\u0435 \u0447\u0435\u043c \u043c\u044b \u0437\u0430\u0433\u0440\u0443\u0437\u0438\u043c \u0434\u0430\u043d\u043d\u044b\u0435 \u0432 \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c K-\u0441\u0440\u0435\u0434\u043d\u0438\u0445, \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u043e \u0438\u0445 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u043e\u0432\u0430\u0442\u044c.  \u0421\u0430\u043c\u044b\u0439 \u043f\u043e\u043f\u0443\u043b\u044f\u0440\u043d\u044b\u0439 \u043c\u0435\u0442\u043e\u0434 \u2014 Tdidf Vectorizer, \u043a\u043e\u0442\u043e\u0440\u044b\u0439 \u0441\u043e\u0437\u0434\u0430\u0435\u0442 \u043c\u0430\u0442\u0440\u0438\u0446\u0443 \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u0447\u0430\u0441\u0442\u043e\u0442\u044b \u0441\u043b\u043e\u0432 \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0430\u0445, \u0438 \u0438\u043c\u0435\u043d\u043d\u043e \u0435\u0433\u043e \u043c\u044b \u0441\u043e\u0431\u0438\u0440\u0430\u0435\u043c\u0441\u044f \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c.  \u041f\u043e \u0441\u0443\u0442\u0438, \u044d\u0442\u043e \u043f\u043e\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442, \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0432\u0430\u0436\u043d\u043e \u0441\u043b\u043e\u0432\u043e \u0434\u043b\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0430. \u0421\u0442\u043e\u0438\u0442 \u043e\u0442\u043c\u0435\u0442\u0438\u0442\u044c, \u0447\u0442\u043e \u0432 \u0431\u0443\u0434\u0443\u0449\u0435\u0439 \u0440\u0430\u0431\u043e\u0442\u0435 word2vec \u0438 doc2vec \u043c\u043e\u0433\u0443\u0442 \u0431\u044b\u0442\u044c \u0433\u043e\u0440\u0430\u0437\u0434\u043e \u0431\u043e\u043b\u0435\u0435 \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u044b\u043c\u0438 \u0434\u043b\u044f \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u044f \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0439 \u043c\u0435\u0436\u0434\u0443 \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u0430\u043c\u0438.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">tfidf_vectorizer <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> TfidfVectorizer<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">max_df<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">0.8<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> max_features<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">200000<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">min_df<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">0.2<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> stop_words<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'english'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">                                 use_idf<\/span><span class=\"token operator\">=<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> tokenizer<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">tokenize_and_stem<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> ngram_range<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token number\">3<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">tfidf_matrix <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tfidf_vectorizer<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">fit_transform<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'Td idf Matrix shape: '<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">tfidf_matrix<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">shape<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">terms <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tfidf_vectorizer<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">get_feature_names<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">dist <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">1<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">-<\/span><span class=\"token plain\"> cosine_similarity<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">tfidf_matrix<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><\/p><\/pre>\n<pre class=\"prism-code language-\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">Td idf Matrix shape:  (1130, 74)<\/span><\/p><\/pre>\n<h2 id=\"k-means\">\u041a \u043e\u0437\u043d\u0430\u0447\u0430\u0435\u0442<\/h2>\n<p>\u0417\u0434\u0435\u0441\u044c \u043f\u0440\u043e\u0438\u0441\u0445\u043e\u0434\u0438\u0442 \u0444\u0430\u043a\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044f, \u0433\u0434\u0435 K \u043e\u0437\u043d\u0430\u0447\u0430\u0435\u0442 \u0441\u043e\u0437\u0434\u0430\u043d\u0438\u0435 5 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u043e\u0432 \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u043c\u0430\u0442\u0440\u0438\u0446\u044b Td-idf.  \u041c\u044b \u043c\u043e\u0436\u0435\u043c \u043b\u0435\u0433\u043a\u043e \u043f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u0430\u0442\u044c, \u0447\u0442\u043e \u044d\u0442\u043e \u043d\u0435 \u0431\u0443\u0434\u0435\u0442 \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u044b\u043c \u0440\u0435\u0448\u0435\u043d\u0438\u0435\u043c, \u043f\u043e\u0442\u043e\u043c\u0443 \u0447\u0442\u043e \u043e\u043d\u043e \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u0435\u0442 \u0442\u043e\u043b\u044c\u043a\u043e \u0447\u0430\u0441\u0442\u043e\u0442\u0443 \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u0441\u043b\u043e\u0432\u0430 \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">num_clusters <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">5<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">km <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> KMeans<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">n_clusters<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">num_clusters<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">km<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">fit<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">tfidf_matrix<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">clusters <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> km<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">labels_<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">tolist<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><\/p><\/pre>\n<p>\u0427\u0442\u043e\u0431\u044b \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u0438\u0442\u044c \u043a\u043b\u0430\u0441\u0442\u0435\u0440, \u044f \u0441\u043e\u0437\u0434\u0430\u044e \u043a\u0430\u0434\u0440 \u0434\u0430\u043d\u043d\u044b\u0445 pandas, \u0438\u043d\u0434\u0435\u043a\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430\u043c\u0438.  \u041d\u0438\u0436\u0435 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u044b \u043f\u0435\u0440\u0432\u044b\u0435 6 \u0441\u043b\u043e\u0432 \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430.  \u041c\u044b \u0437\u0430\u043c\u0435\u0447\u0430\u0435\u043c, \u0447\u0442\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044f \u0434\u0430\u043b\u0435\u043a\u0430 \u043e\u0442 \u0441\u043e\u0432\u0435\u0440\u0448\u0435\u043d\u0441\u0442\u0432\u0430, \u043f\u043e\u0441\u043a\u043e\u043b\u044c\u043a\u0443 \u043d\u0435\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0441\u043b\u043e\u0432\u0430 \u0432\u0445\u043e\u0434\u044f\u0442 \u0431\u043e\u043b\u0435\u0435 \u0447\u0435\u043c \u0432 \u043e\u0434\u043d\u0443 \u0433\u0440\u0443\u043f\u043f\u0443.  \u0422\u0430\u043a\u0436\u0435 \u043d\u0435\u0442 \u0447\u0435\u0442\u043a\u043e\u0433\u043e \u0440\u0430\u0437\u0433\u0440\u0430\u043d\u0438\u0447\u0435\u043d\u0438\u044f \u0441\u043c\u044b\u0441\u043b\u043e\u0432\u043e\u0433\u043e \u0441\u043e\u0434\u0435\u0440\u0436\u0430\u043d\u0438\u044f \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u043e\u0432.  \u041c\u044b \u043b\u0435\u0433\u043a\u043e \u043c\u043e\u0436\u0435\u043c \u0432\u0438\u0434\u0435\u0442\u044c, \u0447\u0442\u043e \u0442\u0435\u0440\u043c\u0438\u043d\u044b, \u0441\u0432\u044f\u0437\u0430\u043d\u043d\u044b\u0435 \u0441 \u0440\u0430\u0431\u043e\u0442\u043e\u0439, \u0432\u043a\u043b\u044e\u0447\u0430\u044e\u0442 \u0431\u043e\u043b\u0435\u0435 \u043e\u0434\u043d\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">items <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">{<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'title'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'description'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> descriptions<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">}<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">frame <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> pd<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">DataFrame<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">items<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> index <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">clusters<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> columns <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'title'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'cluster'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">\"Top terms per cluster:\"<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">order_centroids <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> km<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">cluster_centers_<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">argsort<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> i <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> <\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">range<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">num_clusters<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">\"Cluster %d words:\"<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">%<\/span><span class=\"token plain\"> i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> end<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">''<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> ind <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> order_centroids<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">i<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token number\">6<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\">  <\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">        <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">' %s'<\/span><span class=\"token plain\"> <\/span><span class=\"token operator\">%<\/span><span class=\"token plain\"> vocab_frame<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">ix<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">terms<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">ind<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">split<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">' '<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">values<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">tolist<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> end<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">','<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">print<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><\/pre>\n<pre class=\"prism-code language-\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">Top terms per cluster:<\/span><\/p><p><span class=\"token plain\">Cluster 0 words: labour, employability, european, social, work, eu,<\/span><\/p><p><span class=\"token plain\">Cluster 1 words: occupational, sectors, skill, employability, services, workers,<\/span><\/p><p><span class=\"token plain\">Cluster 2 words: skill, job, labour, develop, market, cedefop,<\/span><\/p><p><span class=\"token plain\">Cluster 3 words: education, training, learning, vocational, education, cedefop,<\/span><\/p><p><span class=\"token plain\">Cluster 4 words: rates, unemployment, area, employability, increasingly, stated,<\/span><\/p><\/pre>\n<h2 id=\"visualization\">\u0412\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f<\/h2>\n<p>\u0427\u0442\u043e\u0431\u044b \u0432\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044e, \u043c\u044b \u0434\u043e\u043b\u0436\u043d\u044b \u0441\u043d\u0430\u0447\u0430\u043b\u0430 \u0443\u043c\u0435\u043d\u044c\u0448\u0438\u0442\u044c \u0438\u0445 \u0440\u0430\u0437\u043c\u0435\u0440\u043d\u043e\u0441\u0442\u044c.  \u041c\u044b \u0434\u043e\u0441\u0442\u0438\u0433\u043b\u0438 \u044d\u0442\u043e\u0433\u043e \u0441 \u043f\u043e\u043c\u043e\u0449\u044c\u044e t-SNE (t-Distributed Stochastic Neighbor Embedding) \u0438\u0437 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438 sklearn.manifold.  \u0414\u0440\u0443\u0433\u043e\u0439 \u0441\u043f\u043e\u0441\u043e\u0431 \u2014 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c PCA \u0438\u043b\u0438 Multi-Demiensional Scaling (MDS).<\/p>\n<p>\u0413\u0440\u0430\u0444\u0438\u043a\u0430 \u0432\u044b\u043f\u043e\u043b\u043d\u044f\u0435\u0442\u0441\u044f \u0441 \u043f\u043e\u043c\u043e\u0449\u044c\u044e \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438 matplotlib.<\/p>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> os  <\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> matplotlib<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">pyplot <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">as<\/span><span class=\"token plain\"> plt<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> matplotlib <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">as<\/span><span class=\"token plain\"> mpl<\/span><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">from<\/span><span class=\"token plain\"> sklearn<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">manifold <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">import<\/span><span class=\"token plain\"> TSNE<\/span><\/p><p><span class=\"token plain\">tsne <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> TSNE<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">n_components<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> verbose<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> perplexity<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">40<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> n_iter<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">300<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">pos <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> tsne<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">fit_transform<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">dist<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">xs<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> ys <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> pos<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> pos<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">cluster_colors <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">{<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'#1b9e77'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'#d95f02'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'#7570b3'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">3<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'#e7298a'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">4<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'#66a61e'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">}<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">cluster_names <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">{<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'A'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'B'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">  <\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'C'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> <\/span><span class=\"token number\">3<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'D'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\">  <\/span><span class=\"token number\">4<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"> <\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'E'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">}<\/span><\/p><\/pre>\n<pre class=\"prism-code language-\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token plain\">[t-SNE] Computing pairwise distances...<\/span><\/p><p><span class=\"token plain\">[t-SNE] Computing 121 nearest neighbors...<\/span><\/p><p><span class=\"token plain\">[t-SNE] Computed conditional probabilities for sample 1000 \/ 1130<\/span><\/p><p><span class=\"token plain\">[t-SNE] Computed conditional probabilities for sample 1130 \/ 1130<\/span><\/p><p><span class=\"token plain\">[t-SNE] Mean sigma: 1.785805<\/span><\/p><p><span class=\"token plain\">[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.947952<\/span><\/p><p><span class=\"token plain\">[t-SNE] Error after 125 iterations: 0.947952<\/span><\/p><\/pre>\n<pre class=\"prism-code language-python\" style=\"color:#F8F8F2;background-color:#282A36\"><p><span class=\"token operator\">%<\/span><span class=\"token plain\">matplotlib inline<\/span><\/p><p><span class=\"token plain\">df <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> pd<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">DataFrame<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token builtin\" style=\"color:rgb(189, 147, 249)\">dict<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">x<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">xs<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> y<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">ys<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> label<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">clusters<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> title<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">titles<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">groups <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> df<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">groupby<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'label'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">fig<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> ax <\/span><span class=\"token operator\">=<\/span><span class=\"token plain\"> plt<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">subplots<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">figsize<\/span><span class=\"token operator\">=<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token number\">16<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token number\">8<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"> <\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\"\/><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">for<\/span><span class=\"token plain\"> name<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> group <\/span><span class=\"token keyword\" style=\"color:rgb(189, 147, 249);font-style:italic\">in<\/span><span class=\"token plain\"> groups<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">:<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">    ax<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">plot<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">group<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">x<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> group<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">y<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> marker<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'o'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> linestyle<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">''<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> ms<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">12<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">            label<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">cluster_names<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">name<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> color<\/span><span class=\"token operator\">=<\/span><span class=\"token plain\">cluster_colors<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">[<\/span><span class=\"token plain\">name<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">]<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">,<\/span><span class=\"token plain\"> mec<\/span><span class=\"token operator\">=<\/span><span class=\"token string\" style=\"color:rgb(255, 121, 198)\">'none'<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">ax<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">legend<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token plain\">numpoints<\/span><span class=\"token operator\">=<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><span class=\"token plain\"\/><\/p><p><span class=\"token plain\">plt<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">.<\/span><span class=\"token plain\">show<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">(<\/span><span class=\"token punctuation\" style=\"color:rgb(248, 248, 242)\">)<\/span><\/p><\/pre>\n<p><span class=\"gatsby-resp-image-wrapper\" style=\"position:relative;display:block;margin-left:auto;margin-right:auto;max-width:1200px\"><\/p>\n<p>    <span class=\"gatsby-resp-image-background-image\" style=\"padding-bottom:51%;position:relative;bottom:0;left:0;background-image:url('data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAKCAYAAAC0VX7mAAAACXBIWXMAAAsTAAALEwEAmpwYAAAChklEQVQoz01SS08TURTuj\/M3uGAtJCa6c6MbExPjSqMmxoVRW1BBE2lKeVQaeYqgQHm0t9hSYPqYtkzn0ek879zHZ6dg4t3ck\/Od852T7zsJVVVvUkovWBQVaURJSCnhjJEoiggbxixiRNGKZKeWJkHgEikEATiJYuy6jnN+rHZaTXJCniZM07wtpcT\/LxJs9F+GOgLuIwJDw61hQA0ElMO0vBEuOIPkfBRbnS7O\/1RmE71eb4KLUTLa0xr8+dGKKHUVwYQQYfdURO2K4KYqoJ4Ly7JFbo2IzGxR0LYt9KYuLNeJuynjIXq6Np3QDWNcShET8lOtIdf3F9DVWvAOltB9PYZ2\/g3s8ibM\/CtUvmUwl6ni8EBF7f0uOuUu6q0+KqUma6kDDCWbSTi2fcuNKBTX5htKUSpDIqv5G+Z2Er0fk6gXcohODVBbx97iMpbmCD5MFZBZOMFcuoTFbBnFI5U1GhbCMJxJUEdO1I0TPFx9wh+vf5aryjF2itvYVvYx3yhj6SCH5a0c1MgfaWW4Psj2BbbmCTbyVdQVM06PRA+CYCZhGv3xiPkw7A5fUUry\/s+0nKzsys12TT4r5OXL4pocW3kn18mxvOxq8uKsIXuzBdlLH8rzfFV2Lp3YURb7OiLUNG0cVybTGGg4JstUC+zBxlf24ug7y52VWFFXWVDvMT17zKyPu2zwKM\/86QPGDY\/JeDspg+sNP8Uu3x0aivh0uBBXdzOsUi0DeuCCM4HrgaBrZ+jfy8J5+wvcupIg7vt3dkPCbKJcLt\/wPG\/add2k4ziTrucl+wN7CownWRCmfN9POa6T8miQcttG0q61UyFY0g39SddxYzw5GAySQ7Ivw+Xu\/AXSR8xz9fgcJAAAAABJRU5ErkJggg==');background-size:cover;display:block\"\/><br \/>\n  <img decoding=\"async\" class=\"gatsby-resp-image-image\" alt=\"\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u0438\" title=\"\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u0438\" src=\"https:\/\/theaisummer.com\/static\/922dc7df629be7ed9fc4cc8729987642\/c1b63\/nlp_20_0.png\" srcset=\"\/static\/922dc7df629be7ed9fc4cc8729987642\/5a46d\/nlp_20_0.png 300w,\/static\/922dc7df629be7ed9fc4cc8729987642\/0a47e\/nlp_20_0.png 600w,\/static\/922dc7df629be7ed9fc4cc8729987642\/c1b63\/nlp_20_0.png 1200w,\/static\/922dc7df629be7ed9fc4cc8729987642\/46e30\/nlp_20_0.png 1303w\" sizes=\"(max-width: 1200px) 100vw, 1200px\" style=\"width:100%;height:100%;margin:0;vertical-align:middle;position:absolute;top:0;left:0\" loading=\"lazy\"\/><\/p>\n<p>    <\/span><\/p>\n<p>\u041c\u044b \u0432\u0438\u0434\u0438\u043c, \u0447\u0442\u043e \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u043d\u0435 \u0442\u0430\u043a \u043f\u043b\u043e\u0445\u0438, \u043a\u0430\u043a \u043c\u044b \u0434\u0443\u043c\u0430\u043b\u0438 \u0438\u0437\u043d\u0430\u0447\u0430\u043b\u044c\u043d\u043e.  \u0425\u043e\u0442\u044f \u0435\u0441\u0442\u044c \u043d\u0435\u043a\u043e\u0442\u043e\u0440\u043e\u0435 \u0447\u0430\u0441\u0442\u0438\u0447\u043d\u043e\u0435 \u0441\u043e\u0432\u043f\u0430\u0434\u0435\u043d\u0438\u0435, \u0433\u0440\u0443\u043f\u043f\u044b \u0432\u0435\u0441\u044c\u043c\u0430 \u0440\u0430\u0437\u043b\u0438\u0447\u0430\u044e\u0442\u0441\u044f.  \u041e\u0434\u043d\u0430\u043a\u043e \u043d\u0435\u0442 \u043d\u0438\u043a\u0430\u043a\u0438\u0445 \u0441\u043e\u043c\u043d\u0435\u043d\u0438\u0439 \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e \u043c\u044b \u043c\u043e\u0436\u0435\u043c \u043e\u043f\u0442\u0438\u043c\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0438\u0445 \u0433\u043e\u0440\u0430\u0437\u0434\u043e \u0434\u0430\u043b\u044c\u0448\u0435.<\/p>\n<p>\u0421\u043b\u0435\u0434\u0443\u0435\u0442 \u043e\u0442\u043c\u0435\u0442\u0438\u0442\u044c, \u0447\u0442\u043e \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b \u0441 \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u0438\u043c\u0438 \u0441\u043b\u043e\u0432\u0430\u043c\u0438 \u043d\u0435 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u044b \u043d\u0430 \u0433\u0440\u0430\u0444\u0438\u043a\u0435.  \u042f \u0442\u0430\u043a\u0436\u0435 \u0437\u0430\u043c\u0435\u0442\u0438\u043b, \u0447\u0442\u043e \u043d\u0435\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u044b \u043d\u0430 \u044f\u0437\u044b\u043a\u0435, \u043e\u0442\u043b\u0438\u0447\u043d\u043e\u043c \u043e\u0442 \u0430\u043d\u0433\u043b\u0438\u0439\u0441\u043a\u043e\u0433\u043e.  \u0412 \u043d\u0430\u0441\u0442\u043e\u044f\u0449\u0435\u0435 \u0432\u0440\u0435\u043c\u044f \u043c\u044b \u043d\u0435 \u043e\u0431\u0440\u0430\u0431\u0430\u0442\u044b\u0432\u0430\u0435\u043c \u0438\u0445, \u0438 \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u0438\u0445 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u044f \u0444\u0430\u043a\u0442\u0438\u0447\u0435\u0441\u043a\u0438 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u0430.  \u041d\u0430 \u0434\u0438\u0430\u0433\u0440\u0430\u043c\u043c\u0435 \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u043d\u0435\u0443\u043c\u0435\u0441\u0442\u043d\u044b\u0445 \u0442\u043e\u0447\u0435\u043a.<\/p>\n<p>\u041a\u0440\u043e\u043c\u0435 \u0442\u043e\u0433\u043e, \u043f\u0440\u0435\u0434\u0441\u0442\u043e\u0438\u0442 \u0435\u0449\u0435 \u043c\u043d\u043e\u0433\u043e \u0440\u0430\u0431\u043e\u0442\u044b \u043f\u043e \u043e\u0447\u0438\u0441\u0442\u043a\u0435 \u0438 \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0435 \u0434\u0430\u043d\u043d\u044b\u0445.<\/p>\n<p>\u041e\u0434\u0438\u043d \u0438\u0437 \u0441\u043f\u043e\u0441\u043e\u0431\u043e\u0432 \u2014 \u043e\u043f\u0442\u0438\u043c\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 tdidf, \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 doc2vec.  \u0418\u043b\u0438 \u043c\u044b \u043c\u043e\u0436\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u0440\u0443\u0433\u043e\u0439 \u043c\u0435\u0442\u043e\u0434, \u0442\u0430\u043a\u043e\u0439 \u043a\u0430\u043a Affinity Propagation, Spectra Clustering \u0438\u043b\u0438 \u0431\u043e\u043b\u0435\u0435 \u0441\u043e\u0432\u0440\u0435\u043c\u0435\u043d\u043d\u044b\u0435 \u043c\u0435\u0442\u043e\u0434\u044b, \u0442\u0430\u043a\u0438\u0435 \u043a\u0430\u043a HDBSCAN \u0438 Variational Autoencoders, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u044f \u0445\u043e\u0442\u0435\u043b \u0431\u044b \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c.<\/p>\n<p>PS: \u0427\u0442\u043e\u0431\u044b \u0437\u0430\u043f\u0443\u0441\u0442\u0438\u0442\u044c \u043a\u043e\u0434, \u0432\u044b \u043c\u043e\u0436\u0435\u0442\u0435 \u0441\u0434\u0435\u043b\u0430\u0442\u044c \u044d\u0442\u043e \u043f\u0440\u044f\u043c\u043e \u0438\u0437 jupyter, \u0435\u0441\u043b\u0438 \u0443\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u044b \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u044b\u0435 \u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0441\u0442\u0438, \u0438\u043b\u0438 \u0432\u044b \u043c\u043e\u0436\u0435\u0442\u0435 \u044d\u043a\u0441\u043f\u043e\u0440\u0442\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0435\u0433\u043e \u0432 \u0432\u0438\u0434\u0435 \u0444\u0430\u0439\u043b\u0430 .py \u0438 \u0437\u0430\u043f\u0443\u0441\u0442\u0438\u0442\u044c \u0441 \u043f\u043e\u043c\u043e\u0449\u044c\u044e ide \u0438\u043b\u0438 \u043d\u0430\u043f\u0440\u044f\u043c\u0443\u044e \u0447\u0435\u0440\u0435\u0437 \u043a\u043e\u043d\u0441\u043e\u043b\u044c.<\/p>\n<div class=\"dl-prod-book-inline-banner\">\n<div class=\"dl-prod-book-inline-banner__image gatsby-image-wrapper\" style=\"position:relative;overflow:hidden\"><img decoding=\"async\" aria-hidden=\"true\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAZCAYAAAAxFw7TAAAACXBIWXMAAAsTAAALEwEAmpwYAAAFzklEQVQ4y1WU2VOb1xnGzyfVbt1O6tST4AVjglc2s1sskgxC8GlBIAmQkMRugtlBbAJjwAKMDYbYYOyMSTJt46YzSV1PO2k7uWimnfauF532or3oTC\/7L3R6+ev7SaGeXjxzzrf9znOe9\/2OUtm1qAs3UVl2tOybmLJlvGBDy7Kina9JyZRSNWaRKVPGzCq0c5WYDJ21oE6Xp5VRhlI5TtR79WmdrkG9UykPqmQuOlONelc+OCUvn5KX3y5B\/UD0drG8Z9wrRZ0sFri8a4DPiNRlF2aRAQyPrxNff87U2gFtw6tU+keIjCWJTawzvvKY2Y0Dhha2GVnaQY\/FCQ3d5dbsfTFQgSlLzIh7pa75UBdFWW5qe1fpTuzROb\/PVX2Y0uA0ocltAiMbtE9sEZ3ZJTi6SfPQBs7uRapDs\/iH1sStwM7ZUedtAsz1425uIhDw4vU6OVdSj8ftoMxWj8\/bwPVqB5lFdgotdhx1VtwNVoJuK9fK7ehOGzeqqrHW1lNgcXDich1KK2zn4wUPP7zj4XfbLg4THkZ7mni55ObpjJsv111M9Xn40xOdtqAH3ethc9TFyzs6S+\/r7Md1\/rDVwMsFJyfzpB7HSjqo8wXJqGynJ9aC2+9nsLuZtrbmFLjR18xkvwe\/30NvxEul04tN99Ag4NsxF5FWF\/W6Tle7zndzddlySSeqMIa6HkXlRVD5HaIwKleUJ7oYxJwXSs8vNcv9AOpKi8xFl+X6klGDJimqV649AizrRSvrQZV2YyrtRCuJoRVHMZfIAvltnKrpZ2f2EYVTEr61H5PAtII2TPmtaHkBNKmBltuCKVfgRoFVxQCq\/JaoH1XWh7GAAVeFEU44pjmdPGQucR9HZBLL4h6ZvoS4bJXn4rgglFpU5QXTzgWuVOUQyjKIuvE+R3BzWT9V\/juci39IxsAqnzbO0FQywFnnAGUrh1wJJMWlRHO94w24oD0NVzVjqOpRVNUIpqrhFNTqWmR5ag9\/KMFaU4KvJp9i8cxhzumg0NZLT\/QhVyvku6IIWpFEcz3yBq7scZRtSvKZQKsZF\/AYbk+cj1pn+bN+l78PPqY3usj3yjo4W9xLpkDHJp7SPvYzcSTAUsm\/uEskxS2S4qraedTNWdEMJvu0wOP01Y\/yr+ga\/+zZpcEhkRRIPvkBsotDXL0Wxj\/5gqEP\/yKuelI7Mkv+ZqO4BlzV30E5FlB1CUx1afi9lkW+GNgkw3FbYEGOF4c5WRqmqCjM3PkAv50\/YPfVP8gqHybTcpu3JH\/zUWGVvopqWEY575KCO5fIk9yOX5B\/M8fD8Ypuvl8RJV968UVBN\/+Ww4CvP+WvyedU3JjkRPUI\/1dY5d1AuaXHXPdIwRu\/gVtkVeOczGkkp7yTr9sm4bND+M0hf5u7z4B1ireMzK2i6rFUUZVRVNX8AOXbRDUJ2LMucGkJgWvG3LnIaWlW50U3fzx4xn9+\/3O+jK4SqZrnlMSkORJoRv5G9rKAqpkQoH8L1fKQFPgI3iJjYBtTaI9vh5+SaRujtGaQYNMyDlkkuy2JCm+jWh+ljegr\/K8WKZgBaN5MSZO56UjGvcADLi39iNatz7jSucV3\/Gt8yye78Bm7WMakG7nPCUxc1o4bW06DlO9+esuNRpbiQJexRa7DmxzresC127uUDz\/h3S5x1pRMO7LLb2g1YIvfdIp0iQHSDJhrjY6t1+grP6XjyS9p2X2NY\/MLYge\/5uba57Q+ek388Cv6nv0Kz84rIvu\/IPrgc2pnDjmmL6XaLr1lo8qpSic5G9sls2+P4AeviG3\/BHviCRVy7NfM7BBc2Wf00QtCy7s4kp9gufdj3uvbJVt+Q5PkmgYmDKC0jCeZVuNKKuCs4cf0LD\/j4foMQ4tTDC8MMZ24JadOHwtLg1intzgzsifZiSPb7BtYastHMKNdjFECvzoubjY+YmL1OcH5PYpHdsjo2uKdzodcuLVNSXyf7DEBSlG0+oU3sLo5\/gubMZbW7M93hgAAAABJRU5ErkJggg==\" alt=\"\u041a\u043d\u0438\u0433\u0430 \u00ab\u0413\u043b\u0443\u0431\u043e\u043a\u043e\u0435 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u0432 \u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0441\u0442\u0432\u0435\u00bb\" style=\"position:absolute;top:0;left:0;width:100%;height:100%;object-fit:contain;object-position:center;opacity:1;transition-delay:500ms\"\/><noscript><picture><source srcset=\"https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/69585\/deep-learning-book-cover.png 200w,&#10;https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/497c6\/deep-learning-book-cover.png 400w,&#10;https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/3c17d\/deep-learning-book-cover.png 720w\" sizes=\"(max-width: 720px) 100vw, 720px\"\/><img decoding=\"async\" loading=\"lazy\" sizes=\"(max-width: 720px) 100vw, 720px\" srcset=\"https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/69585\/deep-learning-book-cover.png 200w,&#10;https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/497c6\/deep-learning-book-cover.png 400w,&#10;https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/3c17d\/deep-learning-book-cover.png 720w\" src=\"https:\/\/theaisummer.com\/static\/502e7c498dd9d981ac44c1dcd10f9276\/3c17d\/deep-learning-book-cover.png\" alt=\"\u041a\u043d\u0438\u0433\u0430 \u00ab\u0413\u043b\u0443\u0431\u043e\u043a\u043e\u0435 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u0432 \u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0441\u0442\u0432\u0435\u00bb\" style=\"position:absolute;top:0;left:0;opacity:1;width:100%;height:100%;object-fit:cover;object-position:center\"\/><\/picture><\/noscript><\/div>\n<div class=\"dl-prod-book-inline-banner__text\">\n<h2>\u041a\u043d\u0438\u0433\u0430 \u00ab\u0413\u043b\u0443\u0431\u043e\u043a\u043e\u0435 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u0432 \u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0441\u0442\u0432\u0435\u00bb \ud83d\udcd6<\/h2>\n<h4>\u0423\u0437\u043d\u0430\u0439\u0442\u0435, \u043a\u0430\u043a \u0441\u043e\u0437\u0434\u0430\u0432\u0430\u0442\u044c, \u043e\u0431\u0443\u0447\u0430\u0442\u044c, \u0440\u0430\u0437\u0432\u0435\u0440\u0442\u044b\u0432\u0430\u0442\u044c, \u043c\u0430\u0441\u0448\u0442\u0430\u0431\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0438 \u043f\u043e\u0434\u0434\u0435\u0440\u0436\u0438\u0432\u0430\u0442\u044c \u043c\u043e\u0434\u0435\u043b\u0438 \u0433\u043b\u0443\u0431\u043e\u043a\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f.  \u0418\u0437\u0443\u0447\u0438\u0442\u0435 \u0438\u043d\u0444\u0440\u0430\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440\u0443 \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u0438 MLOps \u043d\u0430 \u043f\u0440\u0430\u043a\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0445 \u043f\u0440\u0438\u043c\u0435\u0440\u0430\u0445.<\/h4>\n<p>\u0423\u0437\u043d\u0430\u0442\u044c \u0431\u043e\u043b\u044c\u0448\u0435<\/p><\/div>\n<\/div>\n<p><em class=\"affiliate-disclosure\">* \u0420\u0430\u0441\u043a\u0440\u044b\u0442\u0438\u0435 \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u0438: \u041e\u0431\u0440\u0430\u0442\u0438\u0442\u0435 \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u0435, \u0447\u0442\u043e \u043d\u0435\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0438\u0437 \u043f\u0440\u0438\u0432\u0435\u0434\u0435\u043d\u043d\u044b\u0445 \u0432\u044b\u0448\u0435 \u0441\u0441\u044b\u043b\u043e\u043a \u043c\u043e\u0433\u0443\u0442 \u0431\u044b\u0442\u044c \u043f\u0430\u0440\u0442\u043d\u0435\u0440\u0441\u043a\u0438\u043c\u0438 \u0441\u0441\u044b\u043b\u043a\u0430\u043c\u0438, \u0438 \u043c\u044b \u0431\u0435\u0437 \u0434\u043e\u043f\u043e\u043b\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0445 \u0437\u0430\u0442\u0440\u0430\u0442 \u0434\u043b\u044f \u0432\u0430\u0441 \u043f\u043e\u043b\u0443\u0447\u0438\u043c \u043a\u043e\u043c\u0438\u0441\u0441\u0438\u044e, \u0435\u0441\u043b\u0438 \u0432\u044b \u0440\u0435\u0448\u0438\u0442\u0435 \u0441\u043e\u0432\u0435\u0440\u0448\u0438\u0442\u044c \u043f\u043e\u043a\u0443\u043f\u043a\u0443 \u043f\u043e\u0441\u043b\u0435 \u043f\u0435\u0440\u0435\u0445\u043e\u0434\u0430 \u043f\u043e \u0441\u0441\u044b\u043b\u043a\u0435.<\/em><\/p>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>\u041a\u043b\u0430\u0441\u0442\u0435\u0440\u0438\u0437\u0430\u0446\u0438\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u2014 \u044d\u0442\u043e \u0437\u0430\u0434\u0430\u0447\u0430 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u043f\u043e \u0440\u0430\u0437\u043d\u044b\u043c \u0433\u0440\u0443\u043f\u043f\u0430\u043c \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u0438\u0445 \u0442\u0435\u043a\u0441\u0442\u043e\u0432\u043e\u0433\u043e \u0438 \u0441\u0435\u043c\u0430\u043d\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0430. \u042d\u0442\u043e \u043d\u0435\u043a\u043e\u043d\u0442\u0440\u043e\u043b\u0438\u0440\u0443\u0435\u043c\u044b\u0439 \u043c\u0435\u0442\u043e\u0434, \u043f\u043e\u0441\u043a\u043e\u043b\u044c\u043a\u0443 \u0443 \u043d\u0430\u0441 \u043d\u0435\u0442 \u043c\u0435\u0442\u043e\u043a \u0434\u043b\u044f \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432, \u0438 \u043e\u043d \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u0442\u0441\u044f \u0432 \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u043e\u043d\u043d\u043e\u043c \u043f\u043e\u0438\u0441\u043a\u0435 \u0438 \u043f\u043e\u0438\u0441\u043a\u043e\u0432\u044b\u0445 \u0441\u0438\u0441\u0442\u0435\u043c\u0430\u0445. \u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043d\u0430\u0447\u043d\u0435\u043c&#8230; \u0427\u0442\u043e\u0431\u044b \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u0446\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u044b \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0435 \u0438\u0445 \u0441\u043e\u0434\u0435\u0440\u0436\u0430\u043d\u0438\u044f, \u044f \u0440\u0435\u0448\u0438\u043b \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c \u041a-\u0441\u0440\u0435\u0434\u043d\u0438\u0445. \u0418\u0437-\u0437\u0430 \u0442\u043e\u0433\u043e, \u0447\u0442\u043e \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b \u043d\u0435 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":1618,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"class_list":{"0":"post-1617","1":"post","2":"type-post","3":"status-publish","4":"format-standard","5":"has-post-thumbnail","7":"category-ai-research-and-news"},"_links":{"self":[{"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/posts\/1617","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/gptmain.news\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1617"}],"version-history":[{"count":0,"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/posts\/1617\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/gptmain.news\/index.php?rest_route=\/wp\/v2\/media\/1618"}],"wp:attachment":[{"href":"https:\/\/gptmain.news\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1617"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/gptmain.news\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1617"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/gptmain.news\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1617"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}