Add BERTopic.

This commit is contained in:
戒酒的李白
2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,32 @@
<svg width="228" height="113" viewBox="0 0 228 113" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M68.7889 40.7606L54.4174 26.3594C54.1819 26.1238 53.8638 26 53.5317 26H16.34C14.4403 26 12.8962 27.5352 12.8962 29.4337L12.8765 92.5594C12.8765 94.4578 14.4219 96 16.3209 96H65.6905C67.5889 96 69.1343 94.459 69.1349 92.5613L69.1533 41.6413C69.1533 41.3098 69.0225 40.9949 68.7889 40.7606ZM66.634 92.5606C66.634 93.0806 66.2105 93.501 65.6905 93.501H16.3209C15.8003 93.501 15.3768 93.0844 15.3768 92.5644L15.3965 29.4362C15.3965 28.9162 15.8194 28.5003 16.34 28.5003H53.013L66.6517 42.1632L66.634 92.5606Z" fill="black"/>
<path d="M62.2626 40.3752H57.1876C55.8613 40.3752 54.7508 39.3098 54.7508 37.9835V27.2343C54.7508 26.5435 54.1908 25.9841 53.5006 25.9841C52.8105 25.9841 52.2505 26.5441 52.2505 27.2343V37.9835C52.2505 40.6889 54.4816 42.8749 57.187 42.8749H62.2619C62.9521 42.8749 63.5127 42.3162 63.5127 41.6254C63.5127 40.9346 62.9527 40.3752 62.2626 40.3752Z" fill="black"/>
<path d="M78.7584 30.7822L64.387 16.374C64.1514 16.1384 63.8333 16 63.5019 16H26.3095C24.4105 16 22.8746 17.5581 22.8746 19.4571V27.2343C22.8746 27.9251 23.434 28.4844 24.1248 28.4844C24.8156 28.4844 25.3749 27.9244 25.3749 27.2343V19.4571C25.3749 18.9371 25.7902 18.5003 26.3102 18.5003H62.9838L76.6232 32.1689L76.6041 82.574C76.6041 83.0933 76.1813 83.4997 75.6613 83.4997H67.8841C67.1933 83.4997 66.634 84.0597 66.634 84.7498C66.634 85.44 67.1933 86 67.8841 86H75.6613C77.5603 86 79.1044 84.4717 79.1038 82.5733L79.1235 31.6667C79.1235 31.3352 78.9914 31.0165 78.7584 30.7822Z" fill="black"/>
<path d="M72.2333 30.3746H67.1584C65.8321 30.3746 64.7508 29.339 64.7508 28.0127V17.2635C64.7508 16.5733 64.1908 16.0133 63.5006 16.0133C62.8105 16.0133 62.2505 16.5733 62.2505 17.2635V28.0127C62.2505 30.7181 64.453 32.8749 67.1578 32.8749H72.2327C72.9229 32.8749 73.4835 32.3156 73.4835 31.6248C73.4835 30.934 72.9235 30.3746 72.2333 30.3746Z" fill="black"/>
<path d="M22.7838 46.6248H19.7413C19.0511 46.6248 18.4911 47.1841 18.4911 47.8749C18.4911 48.5657 19.0511 49.1251 19.7413 49.1251H22.7838C23.4733 49.1251 24.034 48.5657 24.034 47.8749C24.034 47.1841 23.4733 46.6248 22.7838 46.6248Z" fill="black"/>
<path d="M62.2429 46.6248H28.3991C27.7076 46.6248 27.1489 47.1841 27.1489 47.8749C27.1489 48.5657 27.7083 49.1251 28.3991 49.1251H62.2429C62.9337 49.1251 63.493 48.5657 63.493 47.8749C63.493 47.1841 62.9337 46.6248 62.2429 46.6248Z" fill="black"/>
<path d="M62.2429 52.8749H52.7603C52.0695 52.8749 51.5102 53.4343 51.5102 54.1251C51.5102 54.8159 52.0695 55.3752 52.7603 55.3752H62.2429C62.9337 55.3752 63.493 54.8159 63.493 54.1251C63.493 53.4343 62.9337 52.8749 62.2429 52.8749Z" fill="black"/>
<path d="M47.1457 52.8749H19.7419C19.0518 52.8749 18.4918 53.4343 18.4918 54.1251C18.4918 54.8159 19.0518 55.3752 19.7419 55.3752H47.1457C47.8353 55.3752 48.3959 54.8159 48.3959 54.1251C48.3959 53.4343 47.8353 52.8749 47.1457 52.8749Z" fill="black"/>
<path d="M62.2429 59.1245H19.7419C19.0518 59.1245 18.4918 59.6845 18.4918 60.3746C18.4918 61.0648 19.0518 61.6248 19.7419 61.6248H62.2429C62.9337 61.6248 63.493 61.0648 63.493 60.3746C63.493 59.6845 62.9337 59.1245 62.2429 59.1245Z" fill="black"/>
<path d="M62.2429 77.8749H19.7419C19.0518 77.8749 18.4918 78.4349 18.4918 79.1251C18.4918 79.8152 19.0518 80.3752 19.7419 80.3752H62.2429C62.9337 80.3752 63.493 79.8152 63.493 79.1251C63.493 78.4349 62.9337 77.8749 62.2429 77.8749Z" fill="black"/>
<path d="M22.7838 65.3746H19.7413C19.0511 65.3746 18.4911 65.9346 18.4911 66.6248C18.4911 67.3149 19.0511 67.8749 19.7413 67.8749H22.7838C23.4733 67.8749 24.034 67.3149 24.034 66.6248C24.034 65.9346 23.4733 65.3746 22.7838 65.3746Z" fill="black"/>
<path d="M62.2429 65.3746H28.3991C27.7076 65.3746 27.1489 65.9346 27.1489 66.6248C27.1489 67.3149 27.7083 67.8749 28.3991 67.8749H62.2429C62.9337 67.8749 63.493 67.3149 63.493 66.6248C63.493 65.9346 62.9337 65.3746 62.2429 65.3746Z" fill="black"/>
<path d="M62.2429 71.6248H52.7603C52.0695 71.6248 51.5102 72.1848 51.5102 72.8749C51.5102 73.5651 52.0695 74.1251 52.7603 74.1251H62.2429C62.9337 74.1251 63.493 73.5651 63.493 72.8749C63.493 72.1848 62.9337 71.6248 62.2429 71.6248Z" fill="black"/>
<path d="M47.1457 71.6248H19.7419C19.0518 71.6248 18.4918 72.1848 18.4918 72.8749C18.4918 73.5651 19.0518 74.1251 19.7419 74.1251H47.1457C47.8353 74.1251 48.3959 73.5651 48.3959 72.8749C48.3959 72.1848 47.8353 71.6248 47.1457 71.6248Z" fill="black"/>
<path d="M22.7838 84.1245H19.7413C19.0511 84.1245 18.4911 84.6845 18.4911 85.3746C18.4911 86.0648 19.0511 86.6248 19.7413 86.6248H22.7838C23.4733 86.6248 24.034 86.0648 24.034 85.3746C24.034 84.6845 23.4733 84.1245 22.7838 84.1245Z" fill="black"/>
<path d="M62.2429 84.1245H28.3991C27.7076 84.1245 27.1489 84.6845 27.1489 85.3746C27.1489 86.0648 27.7083 86.6248 28.3991 86.6248H62.2429C62.9337 86.6248 63.493 86.0648 63.493 85.3746C63.493 84.6845 62.9337 84.1245 62.2429 84.1245Z" fill="black"/>
<path d="M72.2143 36.6248H64.7952C64.1044 36.6248 63.5451 37.1841 63.5451 37.8749C63.5451 38.5657 64.1044 39.1251 64.7952 39.1251H72.2136C72.9044 39.1251 73.4644 38.5657 73.4644 37.8749C73.4644 37.1841 72.9051 36.6248 72.2143 36.6248Z" fill="black"/>
<path d="M72.2137 42.8749H67.8841C67.1933 42.8749 66.634 43.4343 66.634 44.1251C66.634 44.8159 67.1933 45.3752 67.8841 45.3752H72.2137C72.9044 45.3752 73.4638 44.8159 73.4638 44.1251C73.4638 43.4343 72.9044 42.8749 72.2137 42.8749Z" fill="black"/>
<path d="M72.2137 49.1245H67.8841C67.1933 49.1245 66.634 49.6838 66.634 50.3746C66.634 51.0654 67.1933 51.6248 67.8841 51.6248H72.2137C72.9044 51.6248 73.4638 51.0654 73.4638 50.3746C73.4638 49.6838 72.9044 49.1245 72.2137 49.1245Z" fill="black"/>
<path d="M72.2136 67.8749H68.267C67.5775 67.8749 67.0168 68.4349 67.0168 69.1251C67.0168 69.8152 67.5775 70.3752 68.267 70.3752H72.2136C72.9044 70.3752 73.4638 69.8152 73.4638 69.1251C73.4638 68.4349 72.9044 67.8749 72.2136 67.8749Z" fill="black"/>
<path d="M72.2137 55.3746H67.8841C67.1933 55.3746 66.634 55.9346 66.634 56.6248C66.634 57.3149 67.1933 57.8749 67.8841 57.8749H72.2137C72.9044 57.8749 73.4638 57.3149 73.4638 56.6248C73.4638 55.934 72.9044 55.3746 72.2137 55.3746Z" fill="black"/>
<path d="M72.2137 61.6248H67.8841C67.1933 61.6248 66.634 62.1848 66.634 62.8749C66.634 63.5651 67.1933 64.1251 67.8841 64.1251H72.2137C72.9044 64.1251 73.4638 63.5651 73.4638 62.8749C73.4638 62.1848 72.9044 61.6248 72.2137 61.6248Z" fill="black"/>
<path d="M72.2137 74.1244H67.8841C67.1933 74.1244 66.634 74.6844 66.634 75.3746C66.634 76.0648 67.1933 76.6248 67.8841 76.6248H72.2137C72.9044 76.6248 73.4638 76.0648 73.4638 75.3746C73.4638 74.6844 72.9044 74.1244 72.2137 74.1244Z" fill="black"/>
<path d="M155.061 57.0607C155.646 56.4749 155.646 55.5251 155.061 54.9393L145.515 45.3934C144.929 44.8076 143.979 44.8076 143.393 45.3934C142.808 45.9792 142.808 46.9289 143.393 47.5147L151.879 56L143.393 64.4853C142.808 65.0711 142.808 66.0208 143.393 66.6066C143.979 67.1924 144.929 67.1924 145.515 66.6066L155.061 57.0607ZM98 57.5H154V54.5H98V57.5Z" fill="black"/>
<path d="M189 13H180V103H189" stroke="black" stroke-width="2"/>
<path d="M204 13H213V103H204" stroke="black" stroke-width="2"/>
<path d="M194.746 16.6543L196 19.2148L198.062 16.666H198.918L196.322 19.8066L197.98 23H197.219L195.883 20.3281L193.721 23H192.871L195.572 19.7305L193.984 16.6543H194.746ZM194.746 30.6543L196 33.2148L198.062 30.666H198.918L196.322 33.8066L197.98 37H197.219L195.883 34.3281L193.721 37H192.871L195.572 33.7305L193.984 30.6543H194.746ZM194.898 50.5723C194.902 50.4395 194.953 50.3242 195.051 50.2266C195.148 50.1289 195.266 50.0781 195.402 50.0742C195.543 50.0742 195.658 50.1211 195.748 50.2148C195.838 50.3086 195.879 50.4258 195.871 50.5664C195.863 50.7031 195.811 50.8184 195.713 50.9121C195.615 51.0059 195.498 51.0527 195.361 51.0527C195.221 51.0566 195.105 51.0137 195.016 50.9238C194.926 50.8301 194.887 50.7129 194.898 50.5723ZM194.898 64.5723C194.902 64.4395 194.953 64.3242 195.051 64.2266C195.148 64.1289 195.266 64.0781 195.402 64.0742C195.543 64.0742 195.658 64.1211 195.748 64.2148C195.838 64.3086 195.879 64.4258 195.871 64.5664C195.863 64.7031 195.811 64.8184 195.713 64.9121C195.615 65.0059 195.498 65.0527 195.361 65.0527C195.221 65.0566 195.105 65.0137 195.016 64.9238C194.926 64.8301 194.887 64.7129 194.898 64.5723ZM194.898 78.5723C194.902 78.4395 194.953 78.3242 195.051 78.2266C195.148 78.1289 195.266 78.0781 195.402 78.0742C195.543 78.0742 195.658 78.1211 195.748 78.2148C195.838 78.3086 195.879 78.4258 195.871 78.5664C195.863 78.7031 195.811 78.8184 195.713 78.9121C195.615 79.0059 195.498 79.0527 195.361 79.0527C195.221 79.0566 195.105 79.0137 195.016 78.9238C194.926 78.8301 194.887 78.7129 194.898 78.5723ZM194.746 86.6543L196 89.2148L198.062 86.666H198.918L196.322 89.8066L197.98 93H197.219L195.883 90.3281L193.721 93H192.871L195.572 89.7305L193.984 86.6543H194.746Z" fill="black"/>
<path d="M203.047 19.2891L202.074 25H201.617L202.504 19.8945L200.906 20.457L200.984 20.0039L202.961 19.2891H203.047Z" fill="black"/>
<path d="M203.523 38.5977L203.461 39H200.004L200.059 38.6211L202.176 36.5234C202.332 36.3672 202.496 36.1992 202.668 36.0195C202.842 35.8398 202.995 35.6471 203.125 35.4414C203.258 35.2357 203.342 35.0169 203.379 34.7852C203.41 34.5638 203.392 34.3672 203.324 34.1953C203.259 34.0234 203.15 33.888 202.996 33.7891C202.842 33.6875 202.651 33.6354 202.422 33.6328C202.161 33.6302 201.93 33.6875 201.727 33.8047C201.526 33.9219 201.361 34.0807 201.23 34.2812C201.103 34.4818 201.018 34.7044 200.977 34.9492H200.523C200.568 34.6237 200.677 34.3307 200.852 34.0703C201.026 33.8099 201.249 33.6055 201.52 33.457C201.793 33.306 202.098 33.2318 202.434 33.2344C202.736 33.237 202.999 33.2995 203.223 33.4219C203.449 33.5443 203.618 33.7188 203.73 33.9453C203.842 34.1719 203.88 34.4388 203.844 34.7461C203.82 34.9518 203.762 35.1497 203.668 35.3398C203.574 35.5273 203.46 35.7083 203.324 35.8828C203.191 36.0547 203.049 36.2188 202.898 36.375C202.747 36.5286 202.602 36.6745 202.461 36.8125L200.645 38.5977H203.523Z" fill="black"/>
<path d="M201.082 94.6953L200.512 98H200.055L200.785 93.7734H201.223L201.082 94.6953ZM200.828 95.625L200.645 95.5078C200.697 95.2786 200.776 95.056 200.883 94.8398C200.99 94.6211 201.122 94.4258 201.281 94.2539C201.443 94.0794 201.628 93.9427 201.836 93.8438C202.047 93.7422 202.281 93.6927 202.539 93.6953C202.771 93.6979 202.962 93.7409 203.113 93.8242C203.267 93.9049 203.385 94.0182 203.469 94.1641C203.552 94.3073 203.604 94.4727 203.625 94.6602C203.648 94.8451 203.646 95.0417 203.617 95.25L203.152 98H202.691L203.164 95.2422C203.193 95.0391 203.191 94.8516 203.16 94.6797C203.132 94.5052 203.059 94.3659 202.941 94.2617C202.824 94.1549 202.648 94.1016 202.414 94.1016C202.206 94.099 202.014 94.1419 201.84 94.2305C201.665 94.3164 201.509 94.4336 201.371 94.582C201.236 94.7279 201.121 94.8919 201.027 95.0742C200.936 95.2539 200.87 95.4375 200.828 95.625Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 11 KiB

@@ -0,0 +1,17 @@
<svg width="228" height="113" viewBox="0 0 228 113" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M51 13H42V103H51" stroke="black" stroke-width="2"/>
<path d="M66 13H75V103H66" stroke="black" stroke-width="2"/>
<path d="M56.7461 16.6543L58 19.2148L60.0625 16.666H60.918L58.3223 19.8066L59.9805 23H59.2188L57.8828 20.3281L55.7207 23H54.8711L57.5723 19.7305L55.9844 16.6543H56.7461ZM56.7461 30.6543L58 33.2148L60.0625 30.666H60.918L58.3223 33.8066L59.9805 37H59.2188L57.8828 34.3281L55.7207 37H54.8711L57.5723 33.7305L55.9844 30.6543H56.7461ZM56.8984 50.5723C56.9023 50.4395 56.9531 50.3242 57.0508 50.2266C57.1484 50.1289 57.2656 50.0781 57.4023 50.0742C57.543 50.0742 57.6582 50.1211 57.748 50.2148C57.8379 50.3086 57.8789 50.4258 57.8711 50.5664C57.8633 50.7031 57.8105 50.8184 57.7129 50.9121C57.6152 51.0059 57.498 51.0527 57.3613 51.0527C57.2207 51.0566 57.1055 51.0137 57.0156 50.9238C56.9258 50.8301 56.8867 50.7129 56.8984 50.5723ZM56.8984 64.5723C56.9023 64.4395 56.9531 64.3242 57.0508 64.2266C57.1484 64.1289 57.2656 64.0781 57.4023 64.0742C57.543 64.0742 57.6582 64.1211 57.748 64.2148C57.8379 64.3086 57.8789 64.4258 57.8711 64.5664C57.8633 64.7031 57.8105 64.8184 57.7129 64.9121C57.6152 65.0059 57.498 65.0527 57.3613 65.0527C57.2207 65.0566 57.1055 65.0137 57.0156 64.9238C56.9258 64.8301 56.8867 64.7129 56.8984 64.5723ZM56.8984 78.5723C56.9023 78.4395 56.9531 78.3242 57.0508 78.2266C57.1484 78.1289 57.2656 78.0781 57.4023 78.0742C57.543 78.0742 57.6582 78.1211 57.748 78.2148C57.8379 78.3086 57.8789 78.4258 57.8711 78.5664C57.8633 78.7031 57.8105 78.8184 57.7129 78.9121C57.6152 79.0059 57.498 79.0527 57.3613 79.0527C57.2207 79.0566 57.1055 79.0137 57.0156 78.9238C56.9258 78.8301 56.8867 78.7129 56.8984 78.5723ZM56.7461 86.6543L58 89.2148L60.0625 86.666H60.918L58.3223 89.8066L59.9805 93H59.2188L57.8828 90.3281L55.7207 93H54.8711L57.5723 89.7305L55.9844 86.6543H56.7461Z" fill="black"/>
<path d="M65.0469 19.2891L64.0742 25H63.6172L64.5039 19.8945L62.9062 20.457L62.9844 20.0039L64.9609 19.2891H65.0469Z" fill="black"/>
<path d="M65.5234 38.5977L65.4609 39H62.0039L62.0586 38.6211L64.1758 36.5234C64.332 36.3672 64.4961 36.1992 64.668 36.0195C64.8424 35.8398 64.9948 35.6471 65.125 35.4414C65.2578 35.2357 65.3424 35.0169 65.3789 34.7852C65.4102 34.5638 65.3919 34.3672 65.3242 34.1953C65.2591 34.0234 65.1497 33.888 64.9961 33.7891C64.8424 33.6875 64.651 33.6354 64.4219 33.6328C64.1615 33.6302 63.9297 33.6875 63.7266 33.8047C63.526 33.9219 63.3607 34.0807 63.2305 34.2812C63.1029 34.4818 63.0182 34.7044 62.9766 34.9492H62.5234C62.5677 34.6237 62.6771 34.3307 62.8516 34.0703C63.026 33.8099 63.2487 33.6055 63.5195 33.457C63.793 33.306 64.0977 33.2318 64.4336 33.2344C64.7357 33.237 64.9987 33.2995 65.2227 33.4219C65.4492 33.5443 65.6185 33.7188 65.7305 33.9453C65.8424 34.1719 65.8802 34.4388 65.8438 34.7461C65.8203 34.9518 65.7617 35.1497 65.668 35.3398C65.5742 35.5273 65.4596 35.7083 65.3242 35.8828C65.1914 36.0547 65.0495 36.2188 64.8984 36.375C64.7474 36.5286 64.6016 36.6745 64.4609 36.8125L62.6445 38.5977H65.5234Z" fill="black"/>
<path d="M63.082 94.6953L62.5117 98H62.0547L62.7852 93.7734H63.2227L63.082 94.6953ZM62.8281 95.625L62.6445 95.5078C62.6966 95.2786 62.776 95.056 62.8828 94.8398C62.9896 94.6211 63.1224 94.4258 63.2812 94.2539C63.4427 94.0794 63.6276 93.9427 63.8359 93.8438C64.0469 93.7422 64.2812 93.6927 64.5391 93.6953C64.7708 93.6979 64.9622 93.7409 65.1133 93.8242C65.2669 93.9049 65.3854 94.0182 65.4688 94.1641C65.5521 94.3073 65.6042 94.4727 65.625 94.6602C65.6484 94.8451 65.6458 95.0417 65.6172 95.25L65.1523 98H64.6914L65.1641 95.2422C65.1927 95.0391 65.1914 94.8516 65.1602 94.6797C65.1315 94.5052 65.0586 94.3659 64.9414 94.2617C64.8242 94.1549 64.6484 94.1016 64.4141 94.1016C64.2057 94.099 64.0143 94.1419 63.8398 94.2305C63.6654 94.3164 63.5091 94.4336 63.3711 94.582C63.2357 94.7279 63.1211 94.8919 63.0273 95.0742C62.9362 95.2539 62.8698 95.4375 62.8281 95.625Z" fill="black"/>
<path d="M161 13H152V103H161" stroke="black" stroke-width="2"/>
<path d="M176 13H185V103H176" stroke="black" stroke-width="2"/>
<path d="M166.746 24.6543L168 27.2148L170.062 24.666H170.918L168.322 27.8066L169.98 31H169.219L167.883 28.3281L165.721 31H164.871L167.572 27.7305L165.984 24.6543H166.746ZM166.746 38.6543L168 41.2148L170.062 38.666H170.918L168.322 41.8066L169.98 45H169.219L167.883 42.3281L165.721 45H164.871L167.572 41.7305L165.984 38.6543H166.746ZM166.746 52.6543L168 55.2148L170.062 52.666H170.918L168.322 55.8066L169.98 59H169.219L167.883 56.3281L165.721 59H164.871L167.572 55.7305L165.984 52.6543H166.746ZM166.746 66.6543L168 69.2148L170.062 66.666H170.918L168.322 69.8066L169.98 73H169.219L167.883 70.3281L165.721 73H164.871L167.572 69.7305L165.984 66.6543H166.746ZM166.746 80.6543L168 83.2148L170.062 80.666H170.918L168.322 83.8066L169.98 87H169.219L167.883 84.3281L165.721 87H164.871L167.572 83.7305L165.984 80.6543H166.746Z" fill="black"/>
<path d="M173.785 28.7168L173.056 33H172.713L173.378 29.1709L172.18 29.5928L172.238 29.2529L173.721 28.7168H173.785Z" fill="black"/>
<path d="M174.143 46.6982L174.096 47H171.503L171.544 46.7158L173.132 45.1426C173.249 45.0254 173.372 44.8994 173.501 44.7646C173.632 44.6299 173.746 44.4854 173.844 44.3311C173.943 44.1768 174.007 44.0127 174.034 43.8389C174.058 43.6729 174.044 43.5254 173.993 43.3965C173.944 43.2676 173.862 43.166 173.747 43.0918C173.632 43.0156 173.488 42.9766 173.316 42.9746C173.121 42.9727 172.947 43.0156 172.795 43.1035C172.645 43.1914 172.521 43.3105 172.423 43.4609C172.327 43.6113 172.264 43.7783 172.232 43.9619H171.893C171.926 43.7178 172.008 43.498 172.139 43.3027C172.27 43.1074 172.437 42.9541 172.64 42.8428C172.845 42.7295 173.073 42.6738 173.325 42.6758C173.552 42.6777 173.749 42.7246 173.917 42.8164C174.087 42.9082 174.214 43.0391 174.298 43.209C174.382 43.3789 174.41 43.5791 174.383 43.8096C174.365 43.9639 174.321 44.1123 174.251 44.2549C174.181 44.3955 174.095 44.5312 173.993 44.6621C173.894 44.791 173.787 44.9141 173.674 45.0312C173.561 45.1465 173.451 45.2559 173.346 45.3594L171.983 46.6982H174.143Z" fill="black"/>
<path d="M172.622 58.6738L172.953 58.6768C173.127 58.6729 173.293 58.6396 173.451 58.5771C173.611 58.5146 173.746 58.4219 173.855 58.2988C173.967 58.1758 174.035 58.0215 174.061 57.8359C174.086 57.666 174.073 57.5176 174.022 57.3906C173.972 57.2617 173.889 57.1611 173.773 57.0889C173.658 57.0146 173.514 56.9766 173.34 56.9746C173.16 56.9727 172.997 57.0088 172.851 57.083C172.704 57.1572 172.582 57.2607 172.484 57.3936C172.389 57.5244 172.324 57.6768 172.291 57.8506H171.951C171.984 57.6182 172.066 57.4131 172.197 57.2354C172.33 57.0576 172.497 56.9199 172.698 56.8223C172.899 56.7227 173.117 56.6738 173.352 56.6758C173.582 56.6758 173.781 56.7256 173.949 56.8252C174.117 56.9229 174.242 57.0596 174.324 57.2354C174.406 57.4111 174.434 57.6152 174.406 57.8477C174.387 58.0215 174.332 58.1748 174.242 58.3076C174.154 58.4385 174.043 58.5488 173.908 58.6387C173.775 58.7266 173.63 58.7939 173.472 58.8408C173.313 58.8857 173.154 58.9092 172.994 58.9111L172.587 58.9082L172.622 58.6738ZM172.575 58.9756L172.61 58.7441H172.977C173.146 58.748 173.307 58.7715 173.457 58.8145C173.609 58.8574 173.742 58.9229 173.855 59.0107C173.971 59.0967 174.057 59.208 174.113 59.3447C174.172 59.4795 174.19 59.6416 174.169 59.8311C174.147 60.0186 174.096 60.1885 174.014 60.3408C173.934 60.4912 173.829 60.6201 173.7 60.7275C173.571 60.835 173.425 60.918 173.261 60.9766C173.097 61.0332 172.922 61.0605 172.736 61.0586C172.557 61.0566 172.393 61.0264 172.244 60.9678C172.096 60.9092 171.968 60.8271 171.86 60.7217C171.755 60.6143 171.676 60.4863 171.623 60.3379C171.572 60.1875 171.555 60.0215 171.57 59.8398L171.91 59.8428C171.893 60.0225 171.916 60.1807 171.98 60.3174C172.047 60.4541 172.146 60.5615 172.276 60.6396C172.409 60.7158 172.565 60.7549 172.745 60.7568C172.937 60.7588 173.108 60.7227 173.261 60.6484C173.415 60.5742 173.541 60.4688 173.639 60.332C173.738 60.1934 173.801 60.0293 173.826 59.8398C173.854 59.6406 173.83 59.4785 173.756 59.3535C173.682 59.2266 173.572 59.1328 173.428 59.0723C173.285 
59.0117 173.123 58.9805 172.941 58.9785L172.575 58.9756Z" fill="black"/>
<path d="M174.444 73.623L174.397 73.9219H171.459L171.497 73.7051L173.914 70.7373H174.222L173.686 71.4727L171.945 73.623H174.444ZM174.298 70.7344L173.562 75H173.22L173.958 70.7344H174.298Z" fill="black"/>
<path d="M172.35 86.8877L172.074 86.8057L172.599 84.7344H174.67L174.623 85.0625H172.848L172.458 86.501C172.575 86.4229 172.703 86.3643 172.842 86.3252C172.98 86.2842 173.12 86.2646 173.261 86.2666C173.454 86.2666 173.621 86.3047 173.762 86.3809C173.902 86.4551 174.016 86.5566 174.102 86.6855C174.189 86.8125 174.249 86.958 174.28 87.1221C174.313 87.2842 174.32 87.4541 174.301 87.6318C174.277 87.8271 174.229 88.0117 174.157 88.1855C174.085 88.3594 173.988 88.5127 173.867 88.6455C173.746 88.7764 173.602 88.8789 173.434 88.9531C173.268 89.0273 173.079 89.0625 172.868 89.0586C172.69 89.0586 172.532 89.0293 172.394 88.9707C172.257 88.9121 172.142 88.8301 172.048 88.7246C171.954 88.6172 171.883 88.4922 171.834 88.3496C171.785 88.2051 171.761 88.0479 171.761 87.8779H172.089C172.089 88.0479 172.117 88.1992 172.174 88.332C172.23 88.4629 172.316 88.5664 172.432 88.6426C172.549 88.7188 172.698 88.7578 172.88 88.7598C173.044 88.7598 173.188 88.7295 173.311 88.6689C173.436 88.6084 173.542 88.5254 173.63 88.4199C173.72 88.3145 173.791 88.1943 173.844 88.0596C173.896 87.9248 173.934 87.7832 173.955 87.6348C173.973 87.5 173.971 87.3711 173.949 87.248C173.928 87.123 173.886 87.0117 173.823 86.9141C173.761 86.8145 173.677 86.7363 173.571 86.6797C173.466 86.6211 173.339 86.5898 173.19 86.5859C173.028 86.584 172.879 86.6094 172.742 86.6621C172.607 86.7148 172.477 86.79 172.35 86.8877Z" fill="black"/>
<path d="M134.061 62.0607C134.646 61.4749 134.646 60.5251 134.061 59.9393L124.515 50.3934C123.929 49.8076 122.979 49.8076 122.393 50.3934C121.808 50.9792 121.808 51.9289 122.393 52.5147L130.879 61L122.393 69.4853C121.808 70.0711 121.808 71.0208 122.393 71.6066C122.979 72.1924 123.929 72.1924 124.515 71.6066L134.061 62.0607ZM91 62.5H133V59.5H91V62.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 9.9 KiB

@@ -0,0 +1,14 @@
<svg width="228" height="113" viewBox="0 0 228 113" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="32" y="12" width="12" height="12" fill="black"/>
<rect x="72" y="9" width="12" height="12" fill="black"/>
<rect x="60" y="32" width="12" height="12" fill="black"/>
<rect x="32" y="44" width="12" height="12" fill="black"/>
<circle cx="166" cy="53" r="6" fill="black"/>
<circle cx="180" cy="19" r="6" fill="black"/>
<circle cx="194" cy="44" r="6" fill="black"/>
<circle cx="154" cy="32" r="6" fill="black"/>
<path d="M90 98L95.1962 107H84.8038L90 98Z" fill="black"/>
<path d="M104 80L109.196 89H98.8038L104 80Z" fill="black"/>
<path d="M121 98L126.196 107H115.804L121 98Z" fill="black"/>
<path d="M127 74L132.196 83H121.804L127 74Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 762 B

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 42 KiB

@@ -0,0 +1,23 @@
<svg width="228" height="113" viewBox="0 0 228 113" fill="none" xmlns="http://www.w3.org/2000/svg">
<line x1="59.8941" y1="40.3059" x2="59.8941" y2="62.85" stroke="black"/>
<line x1="57.9618" y1="40.3059" x2="57.9618" y2="62.85" stroke="black"/>
<line x1="99.1853" y1="40.6618" x2="99.1853" y2="63.2059" stroke="black"/>
<line x1="97.2529" y1="40.6618" x2="97.2529" y2="63.2059" stroke="black"/>
<path d="M51.3695 48.5401V49.794H41.5961V48.5401H51.3695ZM51.3695 53.3565V54.6104H41.5961V53.3565H51.3695Z" fill="#ABA9A9"/>
<path d="M107.229 46.0497L110.651 51.1708L114.084 46.0497H115.748L111.448 52.2606L115.924 58.7294H114.284L110.662 53.3739L107.053 58.7294H105.412L109.889 52.2606L105.588 46.0497H107.229Z" fill="#ABA9A9"/>
<path d="M187.412 51.0458V52.37H175.717V51.0458H187.412ZM182.221 45.5966V58.0184H180.815V45.5966H182.221Z" fill="#ABA9A9"/>
<path d="M126.172 42.3736V60.3736H122.785V42.3736H126.172ZM128.422 54.1627V53.9166C128.422 52.9869 128.555 52.1314 128.82 51.3502C129.086 50.5611 129.473 49.8775 129.981 49.2994C130.488 48.7213 131.113 48.272 131.856 47.9517C132.598 47.6236 133.449 47.4595 134.41 47.4595C135.371 47.4595 136.227 47.6236 136.977 47.9517C137.727 48.272 138.356 48.7213 138.863 49.2994C139.379 49.8775 139.77 50.5611 140.035 51.3502C140.301 52.1314 140.434 52.9869 140.434 53.9166V54.1627C140.434 55.0845 140.301 55.94 140.035 56.7291C139.77 57.5103 139.379 58.1939 138.863 58.7798C138.356 59.358 137.731 59.8072 136.988 60.1275C136.246 60.4478 135.395 60.608 134.434 60.608C133.473 60.608 132.617 60.4478 131.867 60.1275C131.125 59.8072 130.496 59.358 129.981 58.7798C129.473 58.1939 129.086 57.5103 128.82 56.7291C128.555 55.94 128.422 55.0845 128.422 54.1627ZM131.797 53.9166V54.1627C131.797 54.6939 131.844 55.19 131.938 55.6509C132.031 56.1119 132.18 56.5181 132.383 56.8697C132.594 57.2134 132.867 57.483 133.203 57.6783C133.539 57.8736 133.949 57.9713 134.434 57.9713C134.903 57.9713 135.305 57.8736 135.641 57.6783C135.977 57.483 136.246 57.2134 136.449 56.8697C136.653 56.5181 136.801 56.1119 136.895 55.6509C136.996 55.19 137.047 54.6939 137.047 54.1627V53.9166C137.047 53.4009 136.996 52.9166 136.895 52.4634C136.801 52.0025 136.649 51.5963 136.438 51.2447C136.235 50.8853 135.965 50.6041 135.629 50.4009C135.293 50.1978 134.887 50.0963 134.41 50.0963C133.934 50.0963 133.528 50.1978 133.192 50.4009C132.863 50.6041 132.594 50.8853 132.383 51.2447C132.18 51.5963 132.031 52.0025 131.938 52.4634C131.844 52.9166 131.797 53.4009 131.797 53.9166ZM150.535 47.6939H153.594V59.9517C153.594 61.108 153.336 62.0884 152.82 62.8931C152.313 63.7056 151.602 64.3189 150.688 64.733C149.774 65.1548 148.711 65.3658 147.5 65.3658C146.969 65.3658 146.406 65.2955 145.813 65.1548C145.227 65.0142 144.664 64.7955 144.125 64.4986C143.594 64.2017 143.149 63.8267 142.789 63.3736L144.278 61.3814C144.668 61.8345 145.121 62.1861 
145.637 62.4361C146.153 62.6939 146.723 62.8228 147.348 62.8228C147.957 62.8228 148.473 62.7095 148.895 62.483C149.317 62.2642 149.641 61.94 149.867 61.5103C150.094 61.0884 150.207 60.5767 150.207 59.9752V50.6236L150.535 47.6939ZM142.004 54.1861V53.94C142.004 52.9713 142.121 52.0923 142.356 51.3033C142.598 50.5064 142.938 49.8228 143.375 49.2525C143.82 48.6822 144.36 48.2408 144.992 47.9283C145.625 47.6158 146.34 47.4595 147.137 47.4595C147.981 47.4595 148.688 47.6158 149.258 47.9283C149.828 48.2408 150.297 48.6861 150.664 49.2642C151.031 49.8345 151.317 50.5103 151.52 51.2916C151.731 52.065 151.895 52.9127 152.012 53.8345V54.3736C151.895 55.2564 151.719 56.0767 151.485 56.8345C151.25 57.5923 150.942 58.2564 150.559 58.8267C150.176 59.3892 149.699 59.8267 149.129 60.1392C148.567 60.4517 147.895 60.608 147.113 60.608C146.332 60.608 145.625 60.4478 144.992 60.1275C144.367 59.8072 143.832 59.358 143.387 58.7798C142.942 58.2017 142.598 57.522 142.356 56.7408C142.121 55.9595 142.004 55.108 142.004 54.1861ZM145.379 53.94V54.1861C145.379 54.7095 145.43 55.1978 145.531 55.6509C145.633 56.1041 145.789 56.5064 146 56.858C146.219 57.2017 146.488 57.4713 146.809 57.6666C147.137 57.8541 147.524 57.9478 147.969 57.9478C148.586 57.9478 149.09 57.8189 149.481 57.5611C149.871 57.2955 150.164 56.9322 150.36 56.4713C150.555 56.0103 150.668 55.4791 150.699 54.8775V53.3423C150.684 52.8502 150.617 52.4088 150.5 52.0181C150.383 51.6197 150.219 51.2798 150.008 50.9986C149.797 50.7173 149.524 50.4986 149.188 50.3423C148.852 50.1861 148.453 50.108 147.992 50.108C147.547 50.108 147.16 50.2095 146.832 50.4127C146.512 50.608 146.242 50.8775 146.024 51.2213C145.813 51.565 145.653 51.9713 145.543 52.44C145.434 52.9009 145.379 53.4009 145.379 53.94Z" fill="black"/>
<path d="M157.212 52.8801V52.6603C157.212 50.756 157.413 48.9738 157.813 47.3137C158.213 45.6535 158.736 44.1594 159.38 42.8312C160.035 41.5031 160.748 40.3752 161.519 39.4474C162.3 38.5099 163.067 37.8166 163.819 37.3674L164.244 38.5685C163.599 39.0275 162.96 39.6916 162.325 40.5607C161.7 41.4299 161.133 42.4748 160.626 43.6955C160.118 44.9162 159.712 46.2785 159.41 47.7824C159.107 49.2863 158.956 50.8976 158.956 52.6164V52.9094C158.956 54.6281 159.107 56.2394 159.41 57.7433C159.712 59.2473 160.118 60.6096 160.626 61.8303C161.133 63.0607 161.7 64.1154 162.325 64.9943C162.96 65.883 163.599 66.5666 164.244 67.0451L163.819 68.173C163.067 67.7238 162.3 67.0402 161.519 66.1223C160.748 65.2141 160.035 64.1008 159.38 62.7824C158.736 61.4738 158.213 59.9846 157.813 58.3146C157.413 56.6447 157.212 54.8332 157.212 52.8801Z" fill="#ABA9A9"/>
<path d="M221.935 53.2359V53.0162C221.935 51.1119 221.734 49.3297 221.334 47.6695C220.934 46.0093 220.411 44.5152 219.767 43.1871C219.112 41.8589 218.399 40.731 217.628 39.8033C216.847 38.8658 216.08 38.1724 215.328 37.7232L214.903 38.9244C215.548 39.3834 216.188 40.0474 216.822 40.9166C217.447 41.7857 218.014 42.8306 218.521 44.0513C219.029 45.272 219.435 46.6343 219.737 48.1382C220.04 49.6422 220.191 51.2535 220.191 52.9722V53.2652C220.191 54.9839 220.04 56.5953 219.737 58.0992C219.435 59.6031 219.029 60.9654 218.521 62.1861C218.014 63.4166 217.447 64.4713 216.822 65.3502C216.188 66.2388 215.548 66.9224 214.903 67.4009L215.328 68.5289C216.08 68.0797 216.847 67.3961 217.628 66.4781C218.399 65.5699 219.112 64.4566 219.767 63.1382C220.411 61.8297 220.934 60.3404 221.334 58.6705C221.734 57.0005 221.935 55.189 221.935 53.2359Z" fill="#ABA9A9"/>
<path d="M200.208 29.5273L195.345 44H191.935L198.31 26.9375H200.489L200.208 29.5273ZM204.275 44L199.388 29.5273L199.095 26.9375H201.286L207.696 44H204.275ZM204.052 37.6602V40.2031H194.9V37.6602H204.052Z" fill="black"/>
<path d="M171.364 43.9083V61.0177H168.258V47.5294L164.145 48.8888V46.381L171.012 43.9083H171.364Z" fill="black"/>
<line x1="187.929" y1="46.6207" x2="211.762" y2="46.6207" stroke="#ABA9A9"/>
<path d="M201.288 69.1207H198.17V55.2691C198.17 54.316 198.354 53.5152 198.721 52.8668C199.088 52.2105 199.612 51.7144 200.291 51.3785C200.971 51.0425 201.772 50.8746 202.694 50.8746C202.998 50.8746 203.288 50.8941 203.561 50.9332C203.842 50.9722 204.12 51.0269 204.393 51.0972L204.334 53.4527C204.186 53.4136 204.022 53.3863 203.842 53.3707C203.67 53.355 203.479 53.3472 203.268 53.3472C202.846 53.3472 202.487 53.4214 202.19 53.5699C201.893 53.7183 201.666 53.9371 201.51 54.2261C201.362 54.5074 201.288 54.855 201.288 55.2691V69.1207ZM203.854 56.441V58.6675H196.26V56.441H203.854Z" fill="black"/>
<path d="M204.029 65.9971L204.682 67.5322L205.83 65.9971H206.701L205.006 68.1182L206.037 70.2236H205.268L204.568 68.6416L203.377 70.2236H202.514L204.256 68.0479L203.26 65.9971H204.029Z" fill="#0277BD"/>
<path d="M72.0705 47.6938V50.0845H64.6877V47.6938H72.0705ZM66.5158 44.5649H69.8908V56.5532C69.8908 56.9204 69.9377 57.2017 70.0314 57.397C70.133 57.5923 70.2814 57.729 70.4767 57.8071C70.6721 57.8774 70.9182 57.9126 71.215 57.9126C71.426 57.9126 71.6135 57.9048 71.7775 57.8892C71.9494 57.8657 72.0939 57.8423 72.2111 57.8188L72.2228 60.3032C71.9338 60.397 71.6213 60.4712 71.2853 60.5259C70.9494 60.5806 70.5783 60.6079 70.1721 60.6079C69.4299 60.6079 68.7814 60.4868 68.2267 60.2446C67.6799 59.9946 67.258 59.5962 66.9611 59.0493C66.6642 58.5024 66.5158 57.7837 66.5158 56.8931V44.5649ZM78.2932 60.3735H74.8947V46.5688C74.8947 45.6079 75.0822 44.7993 75.4572 44.1431C75.84 43.479 76.3752 42.979 77.0627 42.6431C77.758 42.2993 78.5822 42.1274 79.5353 42.1274C79.8478 42.1274 80.1486 42.1509 80.4377 42.1978C80.7267 42.2368 81.008 42.2876 81.2814 42.3501L81.2463 44.8931C81.0978 44.854 80.9416 44.8267 80.7775 44.811C80.6135 44.7954 80.4221 44.7876 80.2033 44.7876C79.7971 44.7876 79.4494 44.8579 79.1603 44.9985C78.8791 45.1313 78.6642 45.3306 78.5158 45.5962C78.3674 45.8618 78.2932 46.186 78.2932 46.5688V60.3735ZM80.8244 47.6938V50.0845H73.008V47.6938H80.8244Z" fill="black"/>
<path d="M82.291 59.894L82.9434 61.4291L84.0918 59.894H84.9629L83.2676 62.0151L84.2989 64.1205H83.5293L82.8301 62.5385L81.6387 64.1205H80.7754L82.5176 61.9448L81.5215 59.894H82.291ZM90.2442 63.6088C90.416 63.6114 90.5762 63.5789 90.7246 63.5112C90.8731 63.4435 90.9994 63.3471 91.1035 63.2221C91.2077 63.0971 91.2819 62.9526 91.3262 62.7885L91.9981 62.7846C91.9564 63.0685 91.8457 63.3172 91.666 63.5307C91.489 63.7442 91.2715 63.9109 91.0137 64.0307C90.7585 64.1479 90.4916 64.2039 90.2129 64.1987C89.916 64.1935 89.6634 64.1323 89.4551 64.0151C89.2494 63.8953 89.084 63.7364 88.959 63.5385C88.834 63.3406 88.7481 63.1179 88.7012 62.8705C88.6543 62.6205 88.6439 62.364 88.67 62.101L88.6856 61.933C88.7168 61.6492 88.7858 61.3797 88.8926 61.1245C88.9994 60.8666 89.1413 60.6388 89.3184 60.4409C89.4981 60.2403 89.7103 60.0841 89.9551 59.9721C90.1999 59.8601 90.4746 59.808 90.7793 59.8159C91.0762 59.8211 91.334 59.8914 91.5528 60.0268C91.7715 60.1596 91.9408 60.3406 92.0606 60.5698C92.1804 60.7989 92.2403 61.0593 92.2403 61.351L91.5762 61.3471C91.5736 61.1804 91.541 61.0268 91.4785 60.8862C91.416 60.7455 91.3236 60.6323 91.2012 60.5463C91.0788 60.4604 90.9278 60.4135 90.7481 60.4057C90.5319 60.4005 90.3431 60.4409 90.1817 60.5268C90.0228 60.6127 89.8874 60.7312 89.7754 60.8823C89.666 61.0307 89.5788 61.1961 89.5137 61.3784C89.4512 61.5606 89.4082 61.7455 89.3848 61.933L89.3653 62.0971C89.3496 62.2638 89.347 62.4343 89.3575 62.6088C89.3705 62.7833 89.4069 62.9461 89.4668 63.0971C89.5267 63.2455 89.6192 63.3666 89.7442 63.4604C89.8692 63.5541 90.0358 63.6036 90.2442 63.6088Z" fill="#0277BD"/>
<path d="M85.7754 63.2612L85.6817 63.8393C85.6374 64.1231 85.5371 64.3875 85.3809 64.6323C85.2246 64.8771 85.0332 65.0854 84.8067 65.2573L84.416 64.9643C84.5072 64.8523 84.5905 64.7377 84.666 64.6205C84.7416 64.506 84.8054 64.3849 84.8575 64.2573C84.9121 64.1297 84.9538 63.9955 84.9825 63.8549L85.084 63.2612H85.7754Z" fill="#ABA9A9"/>
<path d="M13.8242 58.3923L17.2227 44.5993H19.0625L19.1797 47.5056L15.5469 61.6618H13.6016L13.8242 58.3923ZM11.6797 44.5993L14.4688 58.3454V61.6618H12.3477L8.48047 44.5993H11.6797ZM22.707 58.2868L25.4492 44.5993H28.6602L24.793 61.6618H22.6719L22.707 58.2868ZM19.9414 44.5993L23.3398 58.4391L23.5391 61.6618H21.5938L17.9727 47.4938L18.1133 44.5993H19.9414Z" fill="black"/>
<path d="M29.2528 60.5382L29.9052 62.0734L31.0536 60.5382H31.9247L30.2294 62.6593L31.2606 64.7648H30.4911L29.7919 63.1827L28.6005 64.7648H27.7372L29.4794 62.589L28.4833 60.5382H29.2528ZM37.2059 64.2531C37.3778 64.2557 37.538 64.2231 37.6864 64.1554C37.8348 64.0877 37.9611 63.9913 38.0653 63.8663C38.1695 63.7413 38.2437 63.5968 38.288 63.4327L38.9598 63.4288C38.9182 63.7127 38.8075 63.9614 38.6278 64.1749C38.4507 64.3885 38.2333 64.5551 37.9755 64.6749C37.7203 64.7921 37.4533 64.8481 37.1747 64.8429C36.8778 64.8377 36.6252 64.7765 36.4169 64.6593C36.2111 64.5395 36.0458 64.3807 35.9208 64.1827C35.7958 63.9848 35.7098 63.7622 35.663 63.5148C35.6161 63.2648 35.6057 63.0083 35.6317 62.7452L35.6473 62.5773C35.6786 62.2934 35.7476 62.0239 35.8544 61.7687C35.9611 61.5109 36.1031 61.283 36.2802 61.0851C36.4598 60.8846 36.6721 60.7283 36.9169 60.6163C37.1617 60.5044 37.4364 60.4523 37.7411 60.4601C38.038 60.4653 38.2958 60.5356 38.5145 60.671C38.7333 60.8038 38.9025 60.9848 39.0223 61.214C39.1421 61.4432 39.202 61.7036 39.202 61.9952L38.538 61.9913C38.5354 61.8247 38.5028 61.671 38.4403 61.5304C38.3778 61.3898 38.2854 61.2765 38.163 61.1906C38.0406 61.1046 37.8895 61.0577 37.7098 61.0499C37.4937 61.0447 37.3049 61.0851 37.1434 61.171C36.9846 61.257 36.8492 61.3754 36.7372 61.5265C36.6278 61.6749 36.5406 61.8403 36.4755 62.0226C36.413 62.2049 36.37 62.3898 36.3466 62.5773L36.327 62.7413C36.3114 62.908 36.3088 63.0786 36.3192 63.2531C36.3322 63.4275 36.3687 63.5903 36.4286 63.7413C36.4885 63.8898 36.5809 64.0109 36.7059 64.1046C36.8309 64.1984 36.9976 64.2478 37.2059 64.2531Z" fill="#0277BD"/>
<path d="M32.7372 63.9054L32.6434 64.4835C32.5992 64.7674 32.4989 65.0317 32.3427 65.2765C32.1864 65.5213 31.995 65.7296 31.7684 65.9015L31.3778 65.6085C31.469 65.4965 31.5523 65.382 31.6278 65.2648C31.7033 65.1502 31.7671 65.0291 31.8192 64.9015C31.8739 64.7739 31.9156 64.6398 31.9442 64.4991L32.0458 63.9054H32.7372Z" fill="#ABA9A9"/>
</svg>

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 55 KiB

@@ -0,0 +1,175 @@
---
hide:
- navigation
---
# The Algorithm
Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. This approach aims to make the underlying algorithm as intuitive as possible for a wide range of users.
## **Visual Overview**
BERTopic can be viewed as a sequence of steps to create its topic representations. There are five steps to this process, followed by an optional sixth step that fine-tunes the resulting representations:
<img src="default.svg">
Although these steps are the default, there is some modularity to BERTopic. Each step in this process was carefully selected such that they are all somewhat independent from one another. For example, the tokenization step is not directly influenced by the embedding model that was used to convert the documents, which allows us to be creative in how we perform the tokenization step.
This effect is especially strong in the clustering step. Models like HDBSCAN assume that clusters can have different shapes and forms. As a result, using a centroid-based technique to model the topic representations would not be beneficial since the centroid is not always representative of these types of clusters. A bag-of-words representation, however, makes very few assumptions concerning the shape and form of a cluster.
As a result, BERTopic is quite modular and can maintain its quality of topic generation throughout a variety of sub-models. In other words, BERTopic essentially allows you to **build your own topic model**:
<img src="modularity.svg">
There is extensive documentation on how to use each step in this pipeline:
1. [Embeddings](../getting_started/embeddings/embeddings.html)
2. [Dimensionality Reduction](../getting_started/dim_reduction/dim_reduction.html)
3. [Clustering](../getting_started/clustering/clustering.html)
4. [Tokenizer](../getting_started/vectorizers/vectorizers.html)
5. [Weighting Scheme](../getting_started/ctfidf/ctfidf.html)
6. [Representation Tuning](../getting_started/representation/representation.html)
* [Large Language Models (LLM)](../getting_started/representation/llm.html)
## **Code Overview**
After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is that each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive.
```python
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# All steps together
topic_model = BERTopic(
embedding_model=embedding_model, # Step 1 - Extract embeddings
umap_model=umap_model, # Step 2 - Reduce dimensionality
hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
ctfidf_model=ctfidf_model, # Step 5 - Extract topic words
representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)
```
## **Detailed Overview**
This overview describes each step in more detail such that you can get an intuitive feeling as to what models might fit best at each step in your use case.
### **1. Embed documents**
We start by converting our documents to numerical representations. Although there are many methods for doing so, the default in BERTopic is [sentence-transformers](https://github.com/UKPLab/sentence-transformers). These models are often optimized for semantic similarity which helps tremendously in our clustering task. Moreover, they are great for creating either document- or sentence-embeddings.
<br>
In BERTopic, you can choose any sentence-transformers model but two models are set as defaults:
* `"all-MiniLM-L6-v2"`
* `"paraphrase-multilingual-MiniLM-L12-v2"`
The first is an English language model trained specifically for semantic similarity tasks which works quite
well for most use cases. The second model is very similar to the first with one major difference being that the
`multilingual` models work for 50+ languages. This model is quite a bit larger than the first and is only selected if
you select any language other than English.
!!! tip Embedding models
Although BERTopic uses sentence-transformers models as a default, you can choose
any embedding model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) for selecting
and customizing your model.
### **2. Dimensionality reduction**
After having created our numerical representations of the documents we have to reduce the dimensionality of these representations. Cluster models typically have difficulty handling high dimensional data due to the curse of dimensionality. There are great approaches that can reduce dimensionality, such as PCA, but as a default [UMAP](https://github.com/lmcinnes/umap) is selected in BERTopic. It is a technique that can keep some of a dataset's local and global structure when reducing its dimensionality. This structure is important to keep as it contains the information necessary to create clusters of semantically similar documents.
!!! tip Dimensionality reduction models
Although BERTopic uses UMAP as a default, you can choose
any dimensionality reduction model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html) for selecting
and customizing your model.
### **3. Cluster Documents**
After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents into a cluster where they might not belong. This will improve the resulting topic representation as there is less noise to draw from.
!!! tip Cluster models
Although BERTopic uses HDBSCAN as a default, you can choose
any cluster model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html) for selecting
and customizing your model.
### **4. Bag-of-words**
Before we can start creating the topic representation we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters have different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best-fitting model. In other words, we want a topic representation technique that makes little to no assumption on the expected structure of the clusters.
<br>
To do this, we first combine all documents in a cluster into a single document. That very long document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which the frequency of each word in each cluster can be found. This bag-of-words representation is therefore on a cluster level and not on a document level. This distinction is important as we are interested in words on a topic level (i.e., cluster level). By using a bag-of-words representation, no assumption is made concerning the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes.
!!! tip Bag-of-words and tokenization
There are many ways you can tune or change the bag-of-words step. This step allows for processing the documents however you want without affecting the first step, embedding the documents. You can follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) for more information about tokenization options in BERTopic.
### **5. Topic representation**
From the generated bag-of-words representation, we want to know what makes one cluster different from another. Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents.
<br>
When you apply TF-IDF as usual on a set of documents, what you are doing is comparing the importance of
words between documents. Now, what if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be importance scores for words within a cluster. The more important a word is within a cluster, the more representative it is of that topic. In other words, if we extract the most important words per cluster, we get descriptions of **topics**! This model is called **class-based TF-IDF**:
<br><br>
<img class="w-6/12" src="c-TF-IDF.svg">
<br>
Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.
<br><br>
Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation.
!!! tip c-TF-IDF parameters
In the `ClassTfidfTransformer`, there are a few parameters that might be worth exploring, including an option to perform additional BM-25 weighting. You can find more information about that [here](https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html).
### **6. (Optional) Fine-tune Topic representation**
After having generated the c-TF-IDF representations, we have a set of words that describe a collection of documents. c-TF-IDF
is a method that can quickly generate accurate topic representations. However, with the fast developments in the NLP world, new
and exciting methods are released weekly. In order to keep up with what is happening, there is the possibility to further fine-tune
these c-TF-IDF topics using GPT, T5, KeyBERT, Spacy, and other techniques. Many are implemented in BERTopic for you to use and play around with.
More specifically, we can consider the c-TF-IDF generated topics to be candidate topics. They each contain a set of keywords and
representative documents that we can use to further fine-tune the topic representations. Having a set of representative documents
for each topic is a huge advantage as it allows for fine-tuning on a reduced number of documents. This reduces computation for
large models as they only need to operate on that small set of representative documents for each topic. As a result,
large language models like GPT and T5 become feasible in production settings and typically take less wall time than the dimensionality reduction
and clustering steps.
The following models are implemented in `bertopic.representation`:
* `MaximalMarginalRelevance`
* `PartOfSpeech`
* `KeyBERTInspired`
* `ZeroShotClassification`
* `TextGeneration` (HuggingFace)
* `Cohere`
* `OpenAI`
* `LangChain`
* `LiteLLM`
* `LlamaCPP`
!!! tip Models
There are roughly two sets of models. **First** are the non-generative set of models that you can find [here](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html). These include models that focus on enhancing the keywords in the topic representations. **Second** are the generative models that attempt to label or summarize the topics instead. You can find an overview of [implemented LLMs here](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html).
File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 138 KiB

@@ -0,0 +1,49 @@
<svg width="320" height="252" viewBox="0 0 320 252" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="202" y="210" width="118" height="38" fill="#64B5F6"/>
<rect x="294" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="266" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="238" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="210" y="200" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="228.256" y="237.939">SBERT</tspan></text>
<rect x="202" y="170" width="118" height="38" fill="#E57373"/>
<rect x="294" y="160" width="20" height="8" fill="#E57373"/>
<rect x="266" y="160" width="20" height="8" fill="#E57373"/>
<rect x="238" y="160" width="20" height="8" fill="#E57373"/>
<rect x="210" y="160" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="231.254" y="197.939">UMAP</tspan></text>
<rect x="202" y="130" width="118" height="38" fill="#4DB6AC"/>
<rect x="294" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="266" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="238" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="210" y="120" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="211.342" y="157.939">HDBSCAN</tspan></text>
<rect x="202" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="294" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="266" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="238" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="210" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="208.346" y="113.161">CountVectorizer</tspan></text>
<rect x="202" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="294" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="266" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="238" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="210" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="216.938" y="77.9395">c-TF-IDF</tspan></text>
<rect x="202" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="294" width="20" height="8" fill="#3F51B5"/>
<rect x="266" width="20" height="8" fill="#3F51B5"/>
<rect x="238" width="20" height="8" fill="#3F51B5"/>
<rect x="210" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="231.065" y="25.0576">Optional&#10;</tspan><tspan x="220.271" y="42.0576">Fine-tuning</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="85.4023" y="65.7637">Weighting scheme</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="138.938" y="111.764">Tokenizer</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="136.312" y="153.764">Clustering</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="43.1602" y="193.764">Dimensionality Reduction</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="124.301" y="233.764">Embeddings</tspan></text>
<path d="M169.354 203.646C169.158 203.451 168.842 203.451 168.646 203.646L165.464 206.828C165.269 207.024 165.269 207.34 165.464 207.536C165.66 207.731 165.976 207.731 166.172 207.536L169 204.707L171.828 207.536C172.024 207.731 172.34 207.731 172.536 207.536C172.731 207.34 172.731 207.024 172.536 206.828L169.354 203.646ZM169.5 218L169.5 204L168.5 204L168.5 218L169.5 218Z" fill="black"/>
<path d="M169.354 75.6464C169.158 75.4512 168.842 75.4512 168.646 75.6464L165.464 78.8284C165.269 79.0237 165.269 79.3403 165.464 79.5355C165.66 79.7308 165.976 79.7308 166.172 79.5355L169 76.7071L171.828 79.5355C172.024 79.7308 172.34 79.7308 172.536 79.5355C172.731 79.3403 172.731 79.0237 172.536 78.8284L169.354 75.6464ZM169.5 90L169.5 76L168.5 76L168.5 90L169.5 90Z" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="36.6289" y="25.7637">Fine-tune Representations</tspan></text>
<path d="M169.354 35.6464C169.158 35.4512 168.842 35.4512 168.646 35.6464L165.464 38.8284C165.269 39.0237 165.269 39.3403 165.464 39.5355C165.66 39.7308 165.976 39.7308 166.172 39.5355L169 36.7071L171.828 39.5355C172.024 39.7308 172.34 39.7308 172.536 39.5355C172.731 39.3403 172.731 39.0237 172.536 38.8284L169.354 35.6464ZM169.5 50L169.5 36L168.5 36L168.5 50L169.5 50Z" fill="black"/>
<path d="M169.354 120.646C169.158 120.451 168.842 120.451 168.646 120.646L165.464 123.828C165.269 124.024 165.269 124.34 165.464 124.536C165.66 124.731 165.976 124.731 166.172 124.536L169 121.707L171.828 124.536C172.024 124.731 172.34 124.731 172.536 124.536C172.731 124.34 172.731 124.024 172.536 123.828L169.354 120.646ZM169.5 135L169.5 121L168.5 121L168.5 135L169.5 135Z" fill="black"/>
<path d="M169.354 162.646C169.158 162.451 168.842 162.451 168.646 162.646L165.464 165.828C165.269 166.024 165.269 166.34 165.464 166.536C165.66 166.731 165.976 166.731 166.172 166.536L169 163.707L171.828 166.536C172.024 166.731 172.34 166.731 172.536 166.536C172.731 166.34 172.731 166.024 172.536 165.828L169.354 162.646ZM169.5 177L169.5 163L168.5 163L168.5 177L169.5 177Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 6.2 KiB

@@ -0,0 +1,257 @@
<svg width="992" height="452" viewBox="0 0 992 452" fill="none" xmlns="http://www.w3.org/2000/svg">
<circle cx="803.5" cy="229.5" r="5.5" fill="black"/>
<circle cx="823.5" cy="229.5" r="5.5" fill="black"/>
<circle cx="843.5" cy="229.5" r="5.5" fill="black"/>
<rect x="121" y="411" width="118" height="38" fill="#64B5F6"/>
<rect x="213" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="185" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="157" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="129" y="401" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="147.256" y="438.939">SBERT</tspan></text>
<rect x="253" y="411" width="118" height="38" fill="#64B5F6"/>
<rect x="345" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="317" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="289" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="261" y="401" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="280.936" y="438.939">SpaCy</tspan></text>
<rect x="448" y="411" width="118" height="38" fill="#64B5F6"/>
<rect x="540" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="512" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="484" y="401" width="20" height="8" fill="#64B5F6"/>
<rect x="456" y="401" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="479.623" y="433.764">Transformers</tspan></text>
<circle cx="387.5" cy="429.5" r="5.5" fill="black"/>
<circle cx="406.5" cy="429.5" r="5.5" fill="black"/>
<circle cx="428.5" cy="429.5" r="5.5" fill="black"/>
<path d="M466.944 437.706C463.02 437.706 458.787 435.245 458.787 429.845C458.787 424.445 463.02 421.984 466.944 421.984C469.123 421.984 471.134 422.702 472.625 424.009C474.242 425.444 475.1 427.469 475.1 429.845C475.1 432.222 474.242 434.233 472.625 435.667C471.134 436.975 469.109 437.706 466.944 437.706Z" fill="url(#paint0_radial_31_1943)"/>
<path d="M466.944 437.706C463.02 437.706 458.787 435.245 458.787 429.845C458.787 424.445 463.02 421.984 466.944 421.984C469.123 421.984 471.134 422.702 472.625 424.009C474.242 425.444 475.1 427.469 475.1 429.845C475.1 432.222 474.242 434.233 472.625 435.667C471.134 436.975 469.109 437.706 466.944 437.706Z" fill="url(#paint1_linear_31_1943)"/>
<path d="M473.678 425.172C474.428 426.382 474.819 427.822 474.819 429.423C474.819 431.8 473.961 433.811 472.344 435.245C470.853 436.553 468.828 437.284 466.663 437.284C464.123 437.284 461.454 436.251 459.887 434.058C461.4 436.542 464.245 437.706 466.944 437.706C469.109 437.706 471.134 436.975 472.625 435.667C474.242 434.233 475.1 432.222 475.1 429.845C475.1 428.054 474.612 426.462 473.678 425.172V425.172Z" fill="#EB8F00"/>
<path opacity="0.8" d="M462.177 432.306C463.443 432.306 464.469 431.28 464.469 430.014C464.469 428.748 463.443 427.722 462.177 427.722C460.911 427.722 459.884 428.748 459.884 430.014C459.884 431.28 460.911 432.306 462.177 432.306Z" fill="url(#paint2_radial_31_1943)"/>
<path opacity="0.8" d="M471.627 430.534C472.893 430.534 473.919 429.508 473.919 428.242C473.919 426.976 472.893 425.95 471.627 425.95C470.361 425.95 469.334 426.976 469.334 428.242C469.334 429.508 470.361 430.534 471.627 430.534Z" fill="url(#paint3_radial_31_1943)"/>
<path d="M471.627 427.947C470.544 428.945 469.109 429.733 467.45 430.113C465.791 430.492 464.159 430.422 462.753 429.986C462.5 429.902 462.317 430.239 462.542 430.408C463.962 431.463 465.861 431.913 467.759 431.463C469.658 431.027 471.177 429.803 471.992 428.228C472.133 427.989 471.823 427.764 471.627 427.947V427.947Z" fill="#422B0D"/>
<path d="M464.82 427.638L464.792 427.609C464.778 427.595 464.75 427.581 464.722 427.553C464.694 427.539 464.666 427.511 464.637 427.497C464.609 427.469 464.567 427.441 464.525 427.413C464.483 427.384 464.441 427.356 464.398 427.342C464.356 427.314 464.314 427.3 464.272 427.3C464.23 427.286 464.202 427.286 464.187 427.286H464.159H464.145H464.173L464.103 427.3C464.089 427.3 464.103 427.3 464.103 427.3H464.117C464.131 427.3 464.117 427.3 464.117 427.3H464.103C464.089 427.314 464.061 427.328 464.033 427.342C464.005 427.37 463.962 427.398 463.934 427.427C463.906 427.455 463.878 427.497 463.85 427.539C463.794 427.623 463.752 427.694 463.723 427.75C463.695 427.806 463.667 427.834 463.667 427.834L463.639 427.891C463.498 428.13 463.189 428.228 462.936 428.102C462.767 428.017 462.669 427.863 462.655 427.694V427.539C462.669 427.441 462.683 427.3 462.739 427.131C462.795 426.963 462.894 426.752 463.077 426.555C463.175 426.456 463.287 426.344 463.428 426.273C463.456 426.245 463.498 426.231 463.541 426.217C463.583 426.203 463.611 426.175 463.667 426.161L463.737 426.133C463.766 426.119 463.794 426.119 463.808 426.119L463.878 426.105L463.92 426.091H463.977L464.047 426.077H464.173C464.258 426.077 464.342 426.077 464.427 426.091C464.595 426.119 464.736 426.175 464.862 426.231C465.116 426.358 465.27 426.513 465.397 426.639C465.453 426.709 465.509 426.766 465.537 426.822C465.58 426.878 465.608 426.934 465.636 426.977C465.664 427.019 465.664 427.047 465.678 427.061C465.678 427.075 465.692 427.089 465.692 427.089C465.791 427.342 465.65 427.623 465.369 427.722C465.172 427.792 464.961 427.75 464.82 427.638V427.638Z" fill="#422B0D"/>
<path d="M469.644 426.695L469.616 426.667C469.602 426.653 469.573 426.639 469.545 426.611C469.517 426.597 469.489 426.569 469.461 426.555C469.433 426.527 469.391 426.498 469.348 426.47C469.306 426.442 469.264 426.414 469.222 426.4C469.18 426.372 469.137 426.358 469.095 426.358C469.067 426.344 469.039 426.344 469.025 426.344H468.983H469.011L468.941 426.358C468.927 426.358 468.941 426.358 468.941 426.358H468.955C468.969 426.358 468.955 426.358 468.955 426.358H468.941C468.927 426.372 468.898 426.386 468.856 426.4C468.828 426.428 468.8 426.456 468.758 426.484C468.73 426.513 468.702 426.555 468.673 426.597C468.617 426.681 468.575 426.752 468.547 426.808C468.519 426.864 468.491 426.892 468.491 426.892L468.462 426.934C468.322 427.173 468.012 427.272 467.759 427.145C467.591 427.061 467.492 426.906 467.478 426.738C467.478 426.738 467.478 426.681 467.492 426.583C467.506 426.484 467.52 426.344 467.577 426.175C467.633 426.006 467.731 425.795 467.914 425.598C468.012 425.5 468.125 425.388 468.266 425.317C468.294 425.303 468.336 425.275 468.378 425.261C468.42 425.247 468.448 425.219 468.505 425.205L468.589 425.177C468.617 425.163 468.645 425.163 468.659 425.163L468.73 425.148L468.772 425.134H468.828L468.898 425.12H469.025C469.109 425.12 469.194 425.12 469.278 425.134C469.447 425.163 469.587 425.219 469.714 425.275C469.967 425.402 470.122 425.556 470.248 425.683C470.305 425.753 470.361 425.809 470.389 425.866C470.431 425.922 470.459 425.978 470.487 426.02C470.516 426.063 470.516 426.091 470.53 426.105C470.544 426.119 470.544 426.133 470.544 426.133C470.642 426.386 470.502 426.667 470.22 426.766C469.995 426.864 469.784 426.822 469.644 426.695V426.695Z" fill="#422B0D"/>
<path d="M464.029 432.62C463.993 432.27 464.009 432.07 463.74 432.007C463.439 431.936 463.171 432.152 463.089 432.46C462.874 433.284 463.24 433.859 463.24 433.859C462.719 433.756 462.285 433.25 462.285 433.25C462.013 432.931 461.631 432.137 461.367 431.811C461.206 431.613 460.943 431.451 460.682 431.588C460.055 431.917 460.749 433.036 460.983 433.334C461.248 433.673 460.292 432.589 460.152 432.353C459.89 431.914 459.569 431.797 459.301 432.026C459.032 432.254 459.247 432.824 459.37 433.04C460.119 434.355 460.773 434.677 460.773 434.677C460.773 434.677 460.077 434.342 459.484 433.624C458.924 432.946 458.392 433.662 458.765 434.264C458.844 434.392 458.994 434.722 459.478 435.14C459.849 435.46 459.607 435.268 459.478 435.14C458.983 434.652 458.456 435.172 458.615 435.598C458.799 436.089 459.399 436.543 459.806 436.844C460.596 437.429 461.582 437.897 462.58 437.865C463.691 437.83 464.511 437.367 464.757 435.993C464.871 435.355 464.515 434.376 464.41 434.126C464.065 433.316 464.048 432.818 464.029 432.62Z" fill="url(#paint4_radial_31_1943)"/>
<path d="M462.913 436.038C462.764 435.829 462.344 434.856 463.129 433.86L463.356 434.004C462.468 435.1 463.089 436.037 463.197 436.249C463.197 436.249 463.055 436.24 462.913 436.038V436.038Z" fill="#EB8F00"/>
<path d="M464.497 434.036C464.148 433.36 464.123 433.008 464.099 432.623C464.072 432.344 464.052 432 463.716 431.921C463.573 431.887 463.373 431.907 463.196 432.102C462.753 432.595 463.106 433.739 463.106 433.739C462.469 433.441 462.362 433.178 461.911 432.499C461.714 432.201 461.4 431.65 461.113 431.489C460.908 431.374 460.604 431.46 460.452 431.665C460.111 432.123 460.72 433.053 460.72 433.053C460.72 433.053 460.384 432.564 460.237 432.308C459.988 431.873 459.533 431.633 459.197 431.935C458.648 432.429 459.714 433.756 459.714 433.756C459.714 433.756 459.223 432.978 458.734 433.307C458.48 433.479 458.38 433.926 458.644 434.357C458.761 434.552 459.057 434.898 459.057 434.898C459.057 434.898 458.938 434.822 458.738 434.887C458.516 434.96 458.326 435.235 458.488 435.666C458.675 436.169 459.229 436.605 459.734 436.969C460.883 437.799 462.226 437.85 462.226 437.85C462.226 437.85 460.753 437.567 459.811 436.709C459.516 436.441 458.346 435.624 458.764 435.199C458.862 435.098 459.042 435.047 459.308 435.258C460.243 436.003 460.562 435.83 460.562 435.83C460.596 435.701 460.503 435.646 460.225 435.482C460.032 435.368 459.768 435.212 459.572 435.044C459.093 434.629 458.551 433.908 458.848 433.587C459.007 433.416 459.191 433.452 459.37 433.637C460.613 434.905 460.905 434.726 460.905 434.726C460.905 434.726 460.943 434.589 460.551 434.244C460.25 433.981 459.88 433.656 459.423 432.86C459.302 432.649 459.24 432.274 459.417 432.126C459.657 431.924 459.877 432.057 460.076 432.354C460.317 432.715 460.661 433.16 461.002 433.487C461.416 433.883 461.638 434.004 461.652 433.99C461.75 433.883 461.327 433.468 461.139 433.209C460.475 432.294 460.485 431.993 460.623 431.755C460.71 431.605 460.95 431.537 461.24 431.901C461.864 432.621 462.179 433.793 463.355 434.001C463.355 434.001 463.4 433.93 463.295 433.549C463.168 433.151 463.147 432.812 463.212 432.516C463.283 432.188 463.546 432.095 463.691 432.13C463.885 432.177 463.863 432.503 463.863 432.503C463.863 432.503 463.874 432.623 463.88 
432.677C463.902 432.914 463.948 433.406 464.275 434.165C464.405 434.469 465.103 435.718 464.158 437.28C464.158 437.28 464.559 437.383 465.019 437.49C465.019 437.49 465.2 436.85 465.116 435.856C465.013 434.605 464.732 434.492 464.497 434.036V434.036Z" fill="#EB8F00"/>
<path d="M458.077 435.438C458.082 435.399 458.087 435.365 458.094 435.335C458.084 435.368 458.079 435.401 458.077 435.438Z" fill="url(#paint5_radial_31_1943)"/>
<path d="M469.843 432.62C469.879 432.27 469.865 432.077 470.132 432.007C470.419 431.932 470.696 432.153 470.783 432.46C470.984 433.18 470.599 433.859 470.599 433.859C471.119 433.756 471.56 433.257 471.56 433.257C471.832 432.938 472.242 432.137 472.505 431.811C472.666 431.613 472.856 431.485 473.131 431.586C473.701 431.797 473.106 432.95 472.872 433.25C472.607 433.589 472.775 433.534 473.133 433.142C473.491 432.746 473.681 432.523 473.822 432.288C474.083 431.849 474.397 431.883 474.571 432.026C474.844 432.251 474.625 432.824 474.502 433.04C473.753 434.355 474.388 433.624 474.388 433.624C474.948 432.946 475.48 433.662 475.107 434.264C475.028 434.392 474.878 434.722 474.394 435.14C474.023 435.46 474.265 435.268 474.394 435.14C474.889 434.652 475.416 435.172 475.257 435.598C475.073 436.089 474.473 436.543 474.066 436.844C473.276 437.429 472.29 437.897 471.292 437.865C470.181 437.83 469.361 437.367 469.115 435.993C469.001 435.355 469.357 434.376 469.462 434.126C469.807 433.316 469.824 432.818 469.843 432.62V432.62Z" fill="url(#paint6_radial_31_1943)"/>
<path d="M470.959 436.038C471.108 435.829 471.528 434.856 470.743 433.86L470.516 434.004C471.404 435.1 470.783 436.037 470.675 436.249C470.675 436.249 470.817 436.24 470.959 436.038V436.038Z" fill="#EB8F00"/>
<path d="M469.375 434.036C469.724 433.36 469.749 433.008 469.773 432.623C469.8 432.344 469.82 432 470.156 431.921C470.299 431.887 470.499 431.907 470.676 432.102C471.119 432.595 470.766 433.739 470.766 433.739C471.403 433.441 471.51 433.178 471.961 432.499C472.158 432.201 472.472 431.65 472.759 431.489C472.964 431.374 473.268 431.46 473.42 431.665C473.761 432.123 473.152 433.053 473.152 433.053C473.152 433.053 473.488 432.564 473.635 432.308C473.884 431.873 474.339 431.633 474.675 431.935C475.224 432.429 474.158 433.756 474.158 433.756C474.158 433.756 474.649 432.978 475.138 433.307C475.392 433.479 475.492 433.926 475.228 434.357C475.111 434.552 474.815 434.898 474.815 434.898C474.815 434.898 474.934 434.822 475.134 434.887C475.356 434.96 475.546 435.235 475.384 435.666C475.197 436.169 474.643 436.605 474.138 436.969C472.989 437.799 471.646 437.85 471.646 437.85C471.646 437.85 473.119 437.567 474.061 436.709C474.356 436.441 475.526 435.624 475.108 435.199C475.01 435.098 474.83 435.047 474.564 435.258C473.629 436.003 473.31 435.83 473.31 435.83C473.276 435.701 473.369 435.646 473.647 435.482C473.84 435.368 474.104 435.212 474.3 435.044C474.779 434.629 475.321 433.908 475.024 433.587C474.865 433.416 474.681 433.452 474.502 433.637C473.259 434.905 472.967 434.726 472.967 434.726C472.967 434.726 472.929 434.589 473.321 434.244C473.622 433.981 473.992 433.656 474.449 432.86C474.57 432.649 474.632 432.274 474.455 432.126C474.215 431.924 473.995 432.057 473.796 432.354C473.555 432.715 473.211 433.16 472.87 433.487C472.456 433.883 472.234 434.004 472.22 433.99C472.122 433.883 472.545 433.468 472.733 433.209C473.397 432.294 473.387 431.993 473.249 431.755C473.162 431.605 472.922 431.537 472.632 431.901C472.005 432.624 471.69 433.795 470.516 434.004C470.516 434.004 470.471 433.933 470.576 433.552C470.703 433.154 470.724 432.815 470.659 432.519C470.587 432.191 470.324 432.098 470.18 432.133C469.985 432.18 470.008 432.506 470.008 432.506C470.008 432.506 469.997 432.625 469.991 
432.68C469.969 432.917 469.922 433.409 469.596 434.168C469.465 434.472 468.766 435.733 469.713 437.283C469.713 437.283 469.312 437.386 468.852 437.492C468.852 437.492 468.671 436.853 468.755 435.858C468.859 434.605 469.14 434.492 469.375 434.036V434.036Z" fill="#EB8F00"/>
<path d="M475.795 435.438C475.79 435.399 475.785 435.365 475.778 435.335C475.788 435.368 475.793 435.401 475.795 435.438Z" fill="url(#paint7_radial_31_1943)"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="29.3008" y="432.764">Embeddings</tspan></text>
<rect x="121" y="331" width="118" height="38" fill="#E57373"/>
<rect x="213" y="321" width="20" height="8" fill="#E57373"/>
<rect x="185" y="321" width="20" height="8" fill="#E57373"/>
<rect x="157" y="321" width="20" height="8" fill="#E57373"/>
<rect x="129" y="321" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="150.254" y="358.939">UMAP</tspan></text>
<rect x="253" y="331" width="118" height="38" fill="#E57373"/>
<rect x="345" y="321" width="20" height="8" fill="#E57373"/>
<rect x="317" y="321" width="20" height="8" fill="#E57373"/>
<rect x="289" y="321" width="20" height="8" fill="#E57373"/>
<rect x="261" y="321" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="291.902" y="358.939">PCA</tspan></text>
<rect x="448" y="331" width="118" height="38" fill="#E57373"/>
<rect x="540" y="321" width="20" height="8" fill="#E57373"/>
<rect x="512" y="321" width="20" height="8" fill="#E57373"/>
<rect x="484" y="321" width="20" height="8" fill="#E57373"/>
<rect x="456" y="321" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="456.886" y="356.058">TruncatedSVD</tspan></text>
<circle cx="387.5" cy="349.5" r="5.5" fill="black"/>
<circle cx="407.5" cy="349.5" r="5.5" fill="black"/>
<circle cx="427.5" cy="349.5" r="5.5" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="12.8008" y="346.764">Dimensionality&#10;</tspan><tspan x="41.875" y="360.764">Reduction</tspan></text>
<rect x="121" y="251" width="118" height="38" fill="#4DB6AC"/>
<rect x="213" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="185" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="157" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="129" y="241" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="130.342" y="278.939">HDBSCAN</tspan></text>
<rect x="253" y="251" width="118" height="38" fill="#4DB6AC"/>
<rect x="345" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="317" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="289" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="261" y="241" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="269.246" y="278.939">k-Means</tspan></text>
<rect x="448" y="251" width="118" height="38" fill="#4DB6AC"/>
<rect x="540" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="512" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="484" y="241" width="20" height="8" fill="#4DB6AC"/>
<rect x="456" y="241" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="472.709" y="278.939">BIRCH</tspan></text>
<circle cx="387.5" cy="269.5" r="5.5" fill="black"/>
<circle cx="407.5" cy="269.5" r="5.5" fill="black"/>
<circle cx="427.5" cy="269.5" r="5.5" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="41.3125" y="273.764">Clustering</tspan></text>
<rect x="121" y="171" width="118" height="38" fill="#FFD54F"/>
<rect x="213" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="185" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="157" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="129" y="161" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="127.346" y="195.161">CountVectorizer</tspan></text>
<rect x="253" y="171" width="118" height="38" fill="#FFD54F"/>
<rect x="345" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="317" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="289" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="261" y="161" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="285.73" y="198.939">Jieba</tspan></text>
<rect x="448" y="171" width="118" height="38" fill="#FFD54F"/>
<rect x="540" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="512" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="484" y="161" width="20" height="8" fill="#FFD54F"/>
<rect x="456" y="161" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="486.385" y="198.939">POS</tspan></text>
<circle cx="387.5" cy="189.5" r="5.5" fill="black"/>
<circle cx="407.5" cy="189.5" r="5.5" fill="black"/>
<circle cx="427.5" cy="189.5" r="5.5" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="44.9375" y="193.764">Tokenizer</tspan></text>
<rect x="662" y="289" width="118" height="38" fill="#64B5F6"/>
<rect x="754" y="279" width="20" height="8" fill="#64B5F6"/>
<rect x="726" y="279" width="20" height="8" fill="#64B5F6"/>
<rect x="698" y="279" width="20" height="8" fill="#64B5F6"/>
<rect x="670" y="279" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="689.936" y="316.939">SpaCy</tspan></text>
<rect x="662" y="249" width="118" height="38" fill="#E57373"/>
<rect x="754" y="239" width="20" height="8" fill="#E57373"/>
<rect x="726" y="239" width="20" height="8" fill="#E57373"/>
<rect x="698" y="239" width="20" height="8" fill="#E57373"/>
<rect x="670" y="239" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="700.902" y="276.939">PCA</tspan></text>
<rect x="662" y="209" width="118" height="38" fill="#4DB6AC"/>
<rect x="754" y="199" width="20" height="8" fill="#4DB6AC"/>
<rect x="726" y="199" width="20" height="8" fill="#4DB6AC"/>
<rect x="698" y="199" width="20" height="8" fill="#4DB6AC"/>
<rect x="670" y="199" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="678.246" y="236.939">k-Means</tspan></text>
<rect x="662" y="169" width="118" height="38" fill="#FFD54F"/>
<rect x="754" y="159" width="20" height="8" fill="#FFD54F"/>
<rect x="726" y="159" width="20" height="8" fill="#FFD54F"/>
<rect x="698" y="159" width="20" height="8" fill="#FFD54F"/>
<rect x="670" y="159" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="668.346" y="193.161">CountVectorizer</tspan></text>
<rect x="662" y="129" width="118" height="38" fill="#90A4AE"/>
<rect x="754" y="119" width="20" height="8" fill="#90A4AE"/>
<rect x="726" y="119" width="20" height="8" fill="#90A4AE"/>
<rect x="698" y="119" width="20" height="8" fill="#90A4AE"/>
<rect x="670" y="119" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="676.938" y="156.939">c-TF-IDF</tspan></text>
<circle cx="387.5" cy="110.5" r="5.5" fill="black"/>
<circle cx="407.5" cy="110.5" r="5.5" fill="black"/>
<circle cx="427.5" cy="110.5" r="5.5" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="40.8086" y="106.764">Weighting&#10;</tspan><tspan x="57.1094" y="120.764">scheme</tspan></text>
<rect x="121" y="91" width="118" height="38" fill="#90A4AE"/>
<rect x="213" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="185" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="157" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="129" y="81" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="135.938" y="117.939">c-TF-IDF</tspan></text>
<rect x="253" y="91" width="118" height="38" fill="#90A4AE"/>
<rect x="345" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="317" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="289" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="261" y="81" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="271.326" y="107.058">c-TF-IDF + &#10;</tspan><tspan x="292.025" y="124.058">BM25</tspan></text>
<rect x="448" y="91" width="118" height="38" fill="#90A4AE"/>
<rect x="540" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="512" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="484" y="81" width="20" height="8" fill="#90A4AE"/>
<rect x="456" y="81" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="466.326" y="107.058">c-TF-IDF + &#10;</tspan><tspan x="457.453" y="124.058">Normalization</tspan></text>
<circle cx="387.5" cy="30.5" r="5.5" fill="black"/>
<circle cx="407.5" cy="30.5" r="5.5" fill="black"/>
<circle cx="427.5" cy="30.5" r="5.5" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="10.3984" y="25.7637">Representation&#10;</tspan><tspan x="61.4102" y="39.7637">Tuning</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="51.6484" y="11.7637">(optional)&#10;</tspan></text>
<rect x="121" y="11" width="118" height="38" fill="#3F51B5"/>
<rect x="213" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="185" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="157" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="129" y="1" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="135.732" y="37.9395">GPT / T5</tspan></text>
<rect x="253" y="11" width="118" height="38" fill="#3F51B5"/>
<rect x="345" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="317" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="289" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="261" y="1" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="266.932" y="37.9395">KeyBERT</tspan></text>
<rect x="448" y="11" width="118" height="38" fill="#3F51B5"/>
<rect x="540" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="512" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="484" y="1" width="20" height="8" fill="#3F51B5"/>
<rect x="456" y="1" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="481.863" y="37.9395">MMR</tspan></text>
<rect x="865" y="308" width="118" height="38" fill="#64B5F6"/>
<rect x="957" y="298" width="20" height="8" fill="#64B5F6"/>
<rect x="929" y="298" width="20" height="8" fill="#64B5F6"/>
<rect x="901" y="298" width="20" height="8" fill="#64B5F6"/>
<rect x="873" y="298" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="889.527" y="335.939">TF-IDF</tspan></text>
<rect x="865" y="268" width="118" height="38" fill="#E57373"/>
<rect x="957" y="258" width="20" height="8" fill="#E57373"/>
<rect x="929" y="258" width="20" height="8" fill="#E57373"/>
<rect x="901" y="258" width="20" height="8" fill="#E57373"/>
<rect x="873" y="258" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="15" font-weight="bold" letter-spacing="0em"><tspan x="871.306" y="293.955">TruncatedSVD</tspan></text>
<rect x="865" y="228" width="118" height="38" fill="#4DB6AC"/>
<rect x="957" y="218" width="20" height="8" fill="#4DB6AC"/>
<rect x="929" y="218" width="20" height="8" fill="#4DB6AC"/>
<rect x="901" y="218" width="20" height="8" fill="#4DB6AC"/>
<rect x="873" y="218" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="890.709" y="255.939">BIRCH</tspan></text>
<rect x="865" y="188" width="118" height="38" fill="#FFD54F"/>
<rect x="957" y="178" width="20" height="8" fill="#FFD54F"/>
<rect x="929" y="178" width="20" height="8" fill="#FFD54F"/>
<rect x="901" y="178" width="20" height="8" fill="#FFD54F"/>
<rect x="873" y="178" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="871.346" y="212.161">CountVectorizer</tspan></text>
<rect x="865" y="148" width="118" height="38" fill="#90A4AE"/>
<rect x="957" y="138" width="20" height="8" fill="#90A4AE"/>
<rect x="929" y="138" width="20" height="8" fill="#90A4AE"/>
<rect x="901" y="138" width="20" height="8" fill="#90A4AE"/>
<rect x="873" y="138" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="15" font-weight="bold" letter-spacing="0em"><tspan x="889.756" y="163.955">c-TF-IDF &#10;</tspan><tspan x="895.264" y="181.955">+ BM25</tspan></text>
<rect x="865" y="108" width="118" height="38" fill="#3F51B5"/>
<rect x="957" y="98" width="20" height="8" fill="#3F51B5"/>
<rect x="929" y="98" width="20" height="8" fill="#3F51B5"/>
<rect x="901" y="98" width="20" height="8" fill="#3F51B5"/>
<rect x="873" y="98" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="904.854" y="133.939">GPT</tspan></text>
<path d="M642.061 231.061C642.646 230.475 642.646 229.525 642.061 228.939L632.515 219.393C631.929 218.808 630.979 218.808 630.393 219.393C629.808 219.979 629.808 220.929 630.393 221.515L638.879 230L630.393 238.485C629.808 239.071 629.808 240.021 630.393 240.607C630.979 241.192 631.929 241.192 632.515 240.607L642.061 231.061ZM579 231.5L641 231.5L641 228.5L579 228.5L579 231.5Z" fill="black"/>
<defs>
<radialGradient id="paint0_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(466.944 429.845) scale(8.01)">
<stop offset="0.5" stop-color="#FDE030"/>
<stop offset="0.919" stop-color="#F7C02B"/>
<stop offset="1" stop-color="#F4A223"/>
</radialGradient>
<linearGradient id="paint1_linear_31_1943" x1="466.944" y1="437.706" x2="466.944" y2="421.984" gradientUnits="userSpaceOnUse">
<stop offset="0.158" stop-color="#F4A223"/>
<stop offset="0.333" stop-color="#F7C02B"/>
<stop offset="0.807" stop-color="#FDE030" stop-opacity="0"/>
</linearGradient>
<radialGradient id="paint2_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(462.172 430.015) rotate(-10.6121) scale(2.49432 2.36965)">
<stop stop-color="#ED7770"/>
<stop offset="0.9" stop-color="#ED7770" stop-opacity="0"/>
</radialGradient>
<radialGradient id="paint3_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(471.628 428.247) rotate(-10.6121) scale(2.49404 2.36938)">
<stop stop-color="#ED7770"/>
<stop offset="0.9" stop-color="#ED7770" stop-opacity="0"/>
</radialGradient>
<radialGradient id="paint4_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(462.552 434.229) scale(5.41589)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint5_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(462.104 434.2) scale(5.47088 5.47084)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint6_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(471.321 434.236) rotate(180) scale(5.40773)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint7_radial_31_1943" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(471.768 434.2) rotate(180) scale(5.471 5.47084)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
</defs>
</svg>

After

Width:  |  Height:  |  Size: 32 KiB

@@ -0,0 +1,3 @@
# `Backends`
::: bertopic.backend
@@ -0,0 +1,3 @@
# `BERTopic`
::: bertopic._bertopic.BERTopic
@@ -0,0 +1,3 @@
# `BaseCluster`
::: bertopic.cluster._base.BaseCluster
@@ -0,0 +1,3 @@
# `BaseCluster`
::: bertopic.cluster._base.BaseCluster
@@ -0,0 +1,3 @@
# `c-TF-IDF`
::: bertopic.vectorizers.ClassTfidfTransformer
@@ -0,0 +1,3 @@
# `BaseDimensionalityReduction`
::: bertopic.dimensionality._base.BaseDimensionalityReduction
@@ -0,0 +1,3 @@
# `Plotting`
::: bertopic.plotting
@@ -0,0 +1,3 @@
# `Barchart`
::: bertopic.plotting._barchart.visualize_barchart
@@ -0,0 +1,3 @@
# `Distribution`
::: bertopic.plotting._distribution.visualize_distribution
@@ -0,0 +1,3 @@
# `Documents with DataMapPlot`
::: bertopic.plotting._datamap.visualize_document_datamap
@@ -0,0 +1,3 @@
# `Documents`
::: bertopic.plotting._documents.visualize_documents
@@ -0,0 +1,3 @@
# `DTM`
::: bertopic.plotting._topics_over_time.visualize_topics_over_time
@@ -0,0 +1,3 @@
# `Heatmap`
::: bertopic.plotting._heatmap.visualize_heatmap
@@ -0,0 +1,3 @@
# `Hierarchical Documents`
::: bertopic.plotting._hierarchical_documents.visualize_hierarchical_documents
@@ -0,0 +1,3 @@
# `Hierarchy`
::: bertopic.plotting._hierarchy.visualize_hierarchy
@@ -0,0 +1,3 @@
# `Term Score Decline`
::: bertopic.plotting._term_rank.visualize_term_rank
@@ -0,0 +1,3 @@
# `Topics`
::: bertopic.plotting._topics.visualize_topics
@@ -0,0 +1,3 @@
# `Topics per Class`
::: bertopic.plotting._topics_per_class.visualize_topics_per_class
@@ -0,0 +1,3 @@
# `Representations`
::: bertopic.representation
@@ -0,0 +1,3 @@
# `Vectorizers`
::: bertopic.vectorizers._online_cv.OnlineCountVectorizer
File diff suppressed because it is too large Load Diff
+350
View File
@@ -0,0 +1,350 @@
---
hide:
- navigation
---
# Frequently Asked Questions
## **Why are the results not consistent between runs?**
Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best. You only need to generate the embeddings themselves once and run BERTopic several times
with different parameters.
If you want to reproduce the results, at the expense of [performance](https://umap-learn.readthedocs.io/en/latest/reproducibility.html), you can set a `random_state` in UMAP to prevent
any stochastic behavior:
```python
from bertopic import BERTopic
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5,
min_dist=0.0, metric='cosine', random_state=42)
topic_model = BERTopic(umap_model=umap_model)
```
## **Which embedding model should I choose?**
Unfortunately, there is no definitive list of the best models for each language; this highly depends on your data, the model, and your specific use case. However, the default model in BERTopic (`"all-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual** documents or any other language, `"paraphrase-multilingual-MiniLM-L12-v2"` has shown great performance.
If you want to use a model that provides a higher quality, but takes more computing time, then I would advise using `all-mpnet-base-v2` and `paraphrase-multilingual-mpnet-base-v2` instead.
**MTEB Leaderboard**
New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.
Many of these models can be used with `SentenceTransformers` in BERTopic, like so:
```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
topic_model = BERTopic(embedding_model=embedding_model)
```
**SentenceTransformers**
[SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models) work typically quite well
and are the preferred models to use. They are great at generating document embeddings and have several
multi-lingual versions available.
**🤗 transformers**
BERTopic allows you to use any 🤗 transformers model. These models are typically embeddings created on a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a specific language for which you want to generate embeddings, you can choose the model [here](https://huggingface.co/models).
## **How do I reduce topic outliers?**
There are several ways we can reduce outliers.
First, the number of data points classified as outliers is controlled by the `min_samples` parameter in HDBSCAN. This value is automatically set to the
same value as `min_cluster_size`. However, you can set it independently if you want to reduce the number of generated outliers. Lowering this value will
result in less noise being generated.
```python
from bertopic import BERTopic
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
cluster_selection_method='eom', prediction_data=True, min_samples=5)
topic_model = BERTopic(hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(docs)
```
!!! note "Note"
Although this will lower outliers found in the data, this might force outliers to be put into topics where they do not belong. So make
sure to strike a balance between keeping noise and reducing outliers.
Second, after training our BERTopic model, we can assign outliers to topics by making use of the `.reduce_outliers` function in BERTopic. An advantage of using this approach is that there are four built-in strategies one can choose for reducing outliers. Moreover, this technique allows the user to experiment with reducing outliers across a number of strategies and parameters without actually having to re-train the topic model each time. You can learn more about the `.reduce_outliers` function [here](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html). The following is a minimal example of how to use this function:
```python
from bertopic import BERTopic
# Train your BERTopic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers
new_topics = topic_model.reduce_outliers(docs, topics)
```
Third, we can replace HDBSCAN with any other clustering algorithm that we want. So we can choose a clustering algorithm, like k-Means, that
does not produce any outliers at all. Using k-Means instead of HDBSCAN is straightforward:
```python
from bertopic import BERTopic
from sklearn.cluster import KMeans
cluster_model = KMeans(n_clusters=50)
topic_model = BERTopic(hdbscan_model=cluster_model)
```
## **How do I remove stop words?**
At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings.
Instead, we can use the `CountVectorizer` to preprocess our documents **after** having generated embeddings and clustered
our documents. I have found almost no disadvantages to using the `CountVectorizer` to remove stop words and
it is something I would strongly advise to try out:
```python
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)
```
We can also use the `ClassTfidfTransformer` to reduce the impact of frequent words. The result is very similar to explicitly removing stop words but this process does this automatically:
```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model)
```
## **How can I speed up BERTopic?**
You can speed up BERTopic by either generating your embeddings beforehand or by
setting `calculate_probabilities` to False. Calculating the probabilities is quite expensive and can significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before the model is done running or if you have fewer than a couple of hundred thousand documents.
Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models typically require a GPU and using only a CPU can slow down computation time quite a lot. However, if you do not have access to a GPU, looking into quantization might help.
Lastly, it is also possible to speed up BERTopic with [cuML's](https://rapids.ai/start.html#rapids-release-selector) GPU acceleration of UMAP and HDBSCAN:
```python
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
# Create instances of GPU-accelerated UMAP and HDBSCAN
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
# Pass the above models to be used in BERTopic
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
```
## **I am facing memory issues. Help!**
There are several ways to perform computation with large datasets:
* First, you can set `low_memory` to True when instantiating BERTopic.
This may prevent blowing up the memory in UMAP.
* Second, setting `calculate_probabilities` to False when instantiating BERTopic prevents a huge document-topic
probability matrix from being created. Moreover, HDBSCAN is quite slow when it tries to calculate probabilities on large datasets.
* Third, you can set the minimum frequency of words in the CountVectorizer class to reduce the size of the resulting
sparse c-TF-IDF matrix. You can do this as follows:
```python
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=10)
topic_model = BERTopic(vectorizer_model=vectorizer_model)
```
The [min_df](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
parameter is used to indicate the minimum frequency of words. Setting this value larger than 1 can significantly reduce memory.
* Fourth, you can use <a href="/BERTopic/getting_started/online/online.html">online topic modeling</a> instead to use BERTopic on big data by training the model in chunks.
If the problem persists, then this could be an issue related to your available memory. The processing of millions of documents is quite computationally expensive and sufficient RAM is necessary.
## **I have only a few topics, how do I increase them?**
There are several reasons why your topic model may result in only a few topics:
* First, you might only have a few documents (~1000). This makes it very difficult to properly
extract topics due to the small amount of data available. Increasing the number of documents
might solve your issues.
* Second, `min_topic_size` might be simply too large for your number of documents. If you decrease
the minimum size of topics, then you are much more likely to increase the number of topics generated.
You could also decrease the `n_neighbors` parameter used in `UMAP` if this does not work.
* Third, although this does not happen very often, there simply aren't that many topics to be found
in your documents. You can often see this when you have many `-1` topics, which is not a topic
but a category of outliers.
## **I have too many topics, how do I decrease them?**
If you have a large dataset, then it is possible to generate thousands of topics. Especially with large datasets, there is a good chance they contain many small topics. In practice, you might want a few hundred topics at most to interpret them nicely.
There are a few ways of decreasing the number of generated topics:
* First, we can set the `min_topic_size` in the BERTopic initialization much higher (e.g., 300) to make sure that those small clusters will not be generated. This is an HDBSCAN parameter that specifies the minimum number of documents needed in a cluster. More documents in a cluster mean fewer topics will be generated.
* Second, you can create a custom UMAP model and set `n_neighbors` much higher than the default 15 (e.g., 200). This also prevents those micro clusters from being generated as it will need many neighboring documents to create a cluster.
* Third, we can set `nr_topics` to a value that seems logical to the user. Do note that topics are forced
to merge which might result in a lower quality of topics. In practice, I would advise using
`nr_topics="auto"` as that will merge topics that are very similar. Dissimilar topics will
therefore remain separated.
## **How do I calculate the probabilities of all topics in a document?**
Although it is possible to calculate all the probabilities, the process of doing so is quite computationally
inefficient and might significantly increase the computation time. To prevent this, the probabilities are
not calculated as a default. To calculate them, you will have to set `calculate_probabilities` to True:
```python
from bertopic import BERTopic
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)
```
!!! note
The `calculate_probabilities` parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with [`.approximate_distribution`](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html).
## **Numpy gives me an error when running BERTopic**
With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi.
This is a known issue with the order of installation using pypi. You can find more details about this issue
[here](https://github.com/lmcinnes/umap/issues/567) and [here](https://github.com/scikit-learn-contrib/hdbscan/issues/457).
I would suggest doing one of the following:
* Install the newest version of BERTopic (>= v0.5).
* You can install hdbscan with `pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation` which might resolve the issue
* Install BERTopic in a fresh environment using these steps.
## **How can I run BERTopic without an internet connection?**
The great thing about using sentence-transformers is that it searches automatically for an embedding model locally.
If it cannot find one, it will download the pre-trained model from its servers.
Make sure that you set the correct path for sentence-transformers to work. You can find a bit more about that
[here](https://github.com/UKPLab/sentence-transformers/issues/888).
You can download the corresponding model [here](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/)
and unzip it. Then, simply use the following to create your embedding model:
```python
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('path/to/unzipped/model')
```
Then, pass it to BERTopic:
```python
from bertopic import BERTopic
topic_model = BERTopic(embedding_model=embedding_model)
```
## **Can I use the GPU to speed up the model?**
Yes. The GPU is automatically used when you use a SentenceTransformer or Flair embedding model. Using
a CPU would then definitely slow things down. However, you can use other embeddings like TF-IDF or Doc2Vec
embeddings in BERTopic which do not depend on GPU acceleration.
You can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up both
UMAP and HDBSCAN through GPU acceleration:
```python
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
# Create instances of GPU-accelerated UMAP and HDBSCAN
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
# Pass the above models to be used in BERTopic
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(docs)
```
Depending on the embeddings you are using, you might want to normalize them first to force a cosine-related distance metric in UMAP:
```python
from cuml.preprocessing import normalize
embeddings = normalize(embeddings)
```
## **How can I use BERTopic with Chinese documents?**
Currently, CountVectorizer tokenizes text by splitting on whitespace, which does not work for Chinese.
To get it to work, you will have to create a custom `CountVectorizer` with `jieba`:
```python
from sklearn.feature_extraction.text import CountVectorizer
import jieba
def tokenize_zh(text):
words = jieba.lcut(text)
return words
vectorizer = CountVectorizer(tokenizer=tokenize_zh)
```
Next, we pass our custom vectorizer to BERTopic and create our topic model:
```python
from bertopic import BERTopic
topic_model = BERTopic(embedding_model=model, verbose=True, vectorizer_model=vectorizer)
topics, _ = topic_model.fit_transform(docs, embeddings=embeddings)
```
## **Why does it take so long to import BERTopic?**
The main culprit here seems to be UMAP. After running tests with [Tuna](https://github.com/nschloe/tuna) we
can see that most of the resources when importing BERTopic can be attributed to UMAP:
<img src="img/tuna.png" />
Unfortunately, there currently is no fix for this issue. The most recent ticket regarding this
issue can be found [here](https://github.com/lmcinnes/umap/issues/631).
## **Should I preprocess the data?**
No. By using document embeddings there is typically no need to preprocess the data as all parts of a document
are important in understanding the general topic of the document. Although this holds in 99% of cases, if you
have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags
typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply
topic modeling to HTML-code to extract topics of code, then it becomes important.
## **I run into issues running on Apple Silicon. What should I do?**
Apple Silicon chips (M1 & M2) are based on `arm64` (aka [`AArch64`](https://apple.stackexchange.com/questions/451238/is-m1-chip-aarch64-or-amd64), not to be confused with `amd64`/`x86_64`). There are known issues with upstream dependencies for this architecture, for example [numba](https://github.com/numba/numba/issues/5520). You may not always run into this issue, depending on the extras that you need.
One possible solution is to use [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers), which allow you to set up a Linux-based environment. To run BERTopic effectively you need to be aware of two things:
1. Make sure to use a Docker image specifically built for arm64
2. Make sure to use a *volume* instead of a *bind-mount*
ℹ️ the latter (a bind-mount) significantly reduces disk I/O performance
Using the pre-configured [Data Science Dev Containers](https://github.com/b-data/data-science-devcontainers) makes sure these settings are optimized. To start using them, do the following:
* Install and run Docker
* Clone repository [data-science-devcontainers](https://github.com/b-data/data-science-devcontainers)
* Open VS Code, build the `Python base` or `Python scipy` container and start working
ℹ️ Change `PYTHON_VERSION` to `3.11` in the respective `devcontainer.json` to work with the latest patch release of Python 3.11
* Note that data is persisted in the container
* When using an unmodified `devcontainer.json`: Work in `/home/vscode`
👉 This is the *home directory* of user `vscode`
* Python packages are installed to the home directory by default
👉 This is due to env variable `PIP_USER=1`
* Note that the directory `/workspaces` is also persisted
### **Do these Data Science Dev Containers support GPU acceleration?**
Yes, but only on Linux and Windows.
The CUDA-enabled variants require the following in addition to Docker:
* NVIDIA GPU
* NVIDIA driver
* Linux: [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
* Windows: [GPU support in Docker Desktop](https://docs.docker.com/desktop/gpu/)
ℹ️ The host running the GPU accelerated Dev Containers only requires the NVIDIA driver, the CUDA toolkit does not have to be installed.
See the [CUDA Version Matrix](https://github.com/b-data/jupyterlab-python-docker-stack/blob/main/CUDA_VERSION_MATRIX.md) regarding Ubuntu/CUDA/Python versions and recommended NVIDIA drivers.
@@ -0,0 +1,408 @@
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing) - Overview of Best Practices
Through the nature of BERTopic, its modularity, many variations of the topic modeling technique are possible. However, during the development and through the usage of the package, a set of best practices has been developed that generally leads to great results.
The following are a number of steps, parameters, and settings that you can use that will generally improve the quality of the resulting topics. In other words, after going through the quick start and getting a feeling for the API these steps should get you to the next level of performance.
!!! Note
Although these are called *best practices*, it does not necessarily mean that they work across all use cases perfectly. The underlying modular nature of BERTopic is meant to take different use cases into account. After going through these practices it is advised to fine-tune wherever necessary.
To showcase how these "best practices" work, we will go through an example dataset and apply all practices to it.
## **Data**
For this example, we will use a dataset containing abstracts and metadata from [ArXiv articles](https://huggingface.co/datasets/arxiv_dataset).
```python
from datasets import load_dataset
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
# Extract abstracts to train on and corresponding titles
abstracts = dataset["abstract"]
titles = dataset["title"]
```
!!! Tip "Sentence Splitter"
Whenever you have large documents, you typically want to split them up into either paragraphs or sentences. A nice way to do so is by using NLTK's sentence splitter which is nothing more than:
```python
from nltk.tokenize import sent_tokenize, word_tokenize
sentences = [sent_tokenize(abstract) for abstract in abstracts]
sentences = [sentence for doc in sentences for sentence in doc]
```
## **Pre-calculate Embeddings**
After having created our data, namely `abstracts`, we can dive into the very first best practice, **pre-calculating embeddings**.
BERTopic works by converting documents into numerical values, called embeddings. This process can be very costly, especially if we want to iterate over parameters. Instead, we can calculate those embeddings once and feed them to BERTopic to skip calculating embeddings each time.
```python
from sentence_transformers import SentenceTransformer
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)
```
!!! Tip
New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.
## **Preventing Stochastic Behavior**
In BERTopic, we generally use a dimensionality reduction algorithm to reduce the size of the embeddings. This is done to prevent the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) to a certain degree.
As a default, this is done with [UMAP](https://github.com/lmcinnes/umap) which is an incredible algorithm for reducing dimensional space. However, by default, it shows stochastic behavior which creates different results each time you run it. To prevent that, we will need to set a `random_state` of the model before passing it to BERTopic.
As a result, we can now fully reproduce the results each time we run the model.
```python
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
```
## **Controlling Number of Topics**
There is a parameter to control the number of topics, namely `nr_topics`. This parameter, however, merges topics **after** they have been created. It is a parameter that supports creating a fixed number of topics.
However, it is advised to control the number of topics through the cluster model which is by default HDBSCAN. HDBSCAN has a parameter, namely `min_cluster_size` that indirectly controls the number of topics that will be created.
A higher `min_cluster_size` will generate fewer topics and a lower `min_cluster_size` will generate more topics.
Here, we will go with `min_cluster_size=150` to prevent too many micro-clusters from being created:
```python
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
```
## **Improving Default Representation**
The default representation of topics is calculated through [c-TF-IDF](https://maartengr.github.io/BERTopic/algorithm/algorithm.html#5-topic-representation). However, c-TF-IDF is powered by the [CountVectorizer](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) which converts text into tokens. Using the CountVectorizer, we can do a number of things:
* Remove stopwords
* Ignore infrequent words
* Increase the n-gram range
In other words, we can preprocess the topic representations **after** documents are assigned to topics. This will not influence the clustering process in any way.
Here, we will ignore English stopwords and infrequent words. Moreover, by increasing the n-gram range we will consider topic representations that are made up of one or two words.
```python
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
```
## **Additional Representations**
Previously, we have tuned the default representation but there are quite a number of [other topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) in BERTopic that we can choose from. From [KeyBERTInspired](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#keybertinspired) and [PartOfSpeech](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#partofspeech), to [OpenAI's ChatGPT](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#chatgpt) and [open-source](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#langchain) alternatives, many representations are possible.
In BERTopic, you can model many different topic representations simultaneously to test them out and get different perspectives of topic descriptions. This is called [multi-aspect](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling.
Here, we will demonstrate a number of interesting and useful representations in BERTopic:
* KeyBERTInspired
* A method that derives inspiration from how KeyBERT works
* PartOfSpeech
* Using SpaCy's POS tagging to extract words
* MaximalMarginalRelevance
* Diversify the topic words
* OpenAI
* Use ChatGPT to label our topics
```python
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
# KeyBERT
keybert_model = KeyBERTInspired()
# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")
# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)
# OpenAI (GPT-4o mini)
client = openai.OpenAI(api_key="sk-...")
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, prompt=prompt)
# All representation models
representation_model = {
"KeyBERT": keybert_model,
# "OpenAI": openai_model, # Uncomment if you will use OpenAI
"MMR": mmr_model,
"POS": pos_model
}
```
## **Training**
Now that we have a set of best practices, we can use them in our training loop. Here, several different representations, keywords and labels for our topics will be created. If you want to iterate over the topic model it is advised to use the pre-calculated embeddings as that significantly speeds up training.
```python
from bertopic import BERTopic
topic_model = BERTopic(
# Pipeline models
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,
# Hyperparameters
top_n_words=10,
verbose=True
)
# Train model
topics, probs = topic_model.fit_transform(abstracts, embeddings)
# Show topics
topic_model.get_topic_info()
```
To get all representations for a single topic, we simply run the following:
```python
>>> topic_model.get_topic(1, full=True)
{'Main': [('adversarial', 0.028838938990764302),
('attacks', 0.021726302042463556),
('attack', 0.016803574415028524),
('robustness', 0.013046135743326167),
('adversarial examples', 0.01151254557995679),
('examples', 0.009920962487998853),
('perturbations', 0.009053305826870773),
('adversarial attacks', 0.008747627064844006),
('malware', 0.007675131707700338),
('defense', 0.007365955840313783)],
'KeyBERT': [('adversarial training', 0.76427937),
('adversarial attack', 0.74271905),
('vulnerable adversarial', 0.73302543),
('adversarial', 0.7311052),
('adversarial examples', 0.7179245),
('adversarial attacks', 0.7082),
('adversarially', 0.7005141),
('adversarial robustness', 0.69911957),
('adversarial perturbations', 0.6588783),
('adversary', 0.4467769)],
'OpenAI': [('Adversarial attacks and defense', 1)],
'MMR': [('adversarial', 0.028838938990764302),
('attacks', 0.021726302042463556),
('attack', 0.016803574415028524),
('robustness', 0.013046135743326167),
('adversarial examples', 0.01151254557995679),
('examples', 0.009920962487998853),
('perturbations', 0.009053305826870773),
('adversarial attacks', 0.008747627064844006),
('malware', 0.007675131707700338),
('defense', 0.007365955840313783)],
'POS': [('adversarial', 0.028838938990764302),
('attacks', 0.021726302042463556),
('attack', 0.016803574415028524),
('robustness', 0.013046135743326167),
('adversarial examples', 0.01151254557995679),
('examples', 0.009920962487998853),
('perturbations', 0.009053305826870773),
('adversarial attacks', 0.008747627064844006),
('malware', 0.007675131707700338),
('defense', 0.007365955840313783)]}
```
**NOTE**: The labels generated by OpenAI's **ChatGPT** are especially interesting to use throughout your model. Below, we will go into more detail on how to set them as custom labels.
!!! Tip "Parameters"
If you would like to return the topic-document probability matrix, then it is advised to use `calculate_probabilities=True`. Do note that this can significantly slow down training. To speed it up, use [cuML's HDBSCAN](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html#cuml-hdbscan) instead. You could also approximate the topic-document probability matrix with `.approximate_distribution` which will be discussed later.
## **(Custom) Labels**
The default label of each topic consists of the top 3 words in that topic, joined by underscores.
This, of course, might not be the best label that you can think of for a certain topic. Instead, we can use `.set_topic_labels` to manually label all or certain topics.
We can also use `.set_topic_labels` to use one of the other topic representations that we had before, like `KeyBERTInspired` or even `OpenAI`.
```python
# Label the topics yourself
topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})
# or use one of the other topic representations, like KeyBERTInspired
keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
topic_model.set_topic_labels(keybert_topic_labels)
# or ChatGPT's labels
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)
```
Now that we have set the updated topic labels, we can access them with the many functions used throughout BERTopic. Most notably, you can show the updated labels in visualizations with the `custom_labels=True` parameter.
If we were to run `topic_model.get_topic_info()` it will now include the column `CustomName`. That is the custom label that we just created for each topic.
## **Topic-Document Distribution**
If using `calculate_probabilities=True` is not possible, then you can [approximate the topic-document distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) using `.approximate_distribution`. It is a fast and flexible method for creating different topic-document distributions.
```python
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(abstracts, window=8, stride=4)
```
Next, let's take a look at a specific abstract and see how the topic distribution was extracted:
```python
# Visualize the topic-document distribution for a single document
topic_model.visualize_distribution(topic_distr[abstract_id], custom_labels=True)
```
It seems to have extracted a number of topics that are relevant and shows the distributions of these topics across the abstract. We can go one step further and visualize them on a token-level:
```python
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(abstracts[abstract_id], calculate_tokens=True)
# Visualize the token-level distributions
df = topic_model.visualize_approximate_distribution(abstracts[abstract_id], topic_token_distr[0])
df
```
!!! Tip "use_embedding_model"
As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-word representation, this is quite fast. However, you might want to use the selected embedding_model instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower:
```python
topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)
```
## **Outlier Reduction**
By default, HDBSCAN generates outliers which is a helpful mechanic in creating accurate topic representations. However, you might want to assign every single document to a topic. We can use `.reduce_outliers` to map some or all outliers to a topic:
```python
# Reduce outliers
new_topics = topic_model.reduce_outliers(abstracts, topics)
# Reduce outliers with pre-calculated embeddings instead
new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="embeddings", embeddings=embeddings)
```
!!! Note "Update Topics with Outlier Reduction"
After having generated updated topic assignments, we can pass them to BERTopic in order to update the topic representations:
```python
topic_model.update_topics(docs, topics=new_topics)
```
It is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how the -1 documents should be mapped: are they matched to topic 1 or to topic 2?
## **Visualize Topics**
With visualizations, we are entering the realm of subjective "best practices". These are things that I generally do because I like the representations, but your experience might differ.
Having said that, there are two visualizations that are my go-to when visualizing the topics themselves:
* `topic_model.visualize_topics()`
* `topic_model.visualize_hierarchy()`
```python
# Visualize topics with custom labels
topic_model.visualize_topics(custom_labels=True)
# Visualize hierarchy with custom labels
topic_model.visualize_hierarchy(custom_labels=True)
```
## **Visualize Documents**
When visualizing documents, it helps to have embedded the documents beforehand to speed up computation. Fortunately, we have already done that as a "best practice".
Visualizing documents in 2-dimensional space helps in understanding the underlying structure of the documents and topics.
```python
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
```
The following plot is **interactive** which means that you can zoom in, double click on a label to only see that one and generally interact with the plot:
```python
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
# NOTE: You can also hide the annotations with `hide_annotations=True` which is helpful to see the larger structure
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)
```
!!! Note "2-dimensional space"
Although visualizing the documents in 2-dimensional space gives an idea of their underlying structure, there is a risk involved.
Visualizing the documents in 2-dimensional space means that we have lost significant information since the original embeddings were more than 384 dimensions. Condensing all that information in 2 dimensions is simply not possible. In other words, it is merely an **approximation**, albeit quite an accurate one.
## **Serialization**
When saving a BERTopic model, there are several ways of doing so. You can save the entire model with `pickle`, `pytorch`, or `safetensors`.
Personally, I would advise going with `safetensors` whenever possible. The reason for this is that the format allows for a very small topic model to be saved and shared.
When saving a model with `safetensors`, it skips over saving the dimensionality reduction and clustering models. The `.transform` function will still work without these models but instead assign topics based on the similarity between document embeddings and the topic embeddings.
As a result, the `.transform` step might give different results but it is generally worth it considering the smaller and significantly faster model.
```python
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
```
!!! Note "Embedding Model"
Using `safetensors`, we are not saving the underlying embedding model but merely a pointer to the model. For example, in the above example we are saving the string `"sentence-transformers/all-MiniLM-L6-v2"` so that we can load in the embedding model alongside the topic model.
This currently only works if you are using a sentence transformer model. If you are using a different model, you can load it in when loading the topic model like this:
```python
from sentence_transformers import SentenceTransformer
# Define embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Load model and add embedding model
loaded_model = BERTopic.load("my_model_dir", embedding_model=embedding_model)
```
## **Inference**
To speed up the inference, we can leverage a "best practice" that we used before, namely serialization. When you save a model as `safetensors` and then load it in, we are removing the dimensionality reduction and clustering steps from the pipeline.
Instead, the assignment of topics is done through cosine similarity of document embeddings and topic embeddings. This speeds up inference significantly.
To show its effect, let's start by disabling the logger:
```python
from bertopic._utils import MyLogger
logger = MyLogger()
logger.configure("ERROR")
loaded_model.verbose = False
topic_model.verbose = False
```
Then, we run inference on both the loaded model and the non-loaded model:
```python
>>> %timeit loaded_model.transform(abstracts[:100])
343 ms ± 31.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
```python
>>> %timeit topic_model.transform(abstracts[:100])
1.37 s ± 166 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
Based on the above, the `loaded_model` seems to be quite a bit faster for inference than the original `topic_model`.
@@ -0,0 +1,121 @@
After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings to extract our topics.
This process of clustering is quite important because the more performant our clustering technique the more accurate our topic representations are.
In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. However, there is not one perfect
clustering model and you might want to be using something entirely different for your use case. Moreover, what if a new state-of-the-art model
is released tomorrow? We would like to be able to use that in BERTopic, right? Since BERTopic assumes some independence among steps, we can allow for this modularity:
<figure markdown>
![Image title](clustering.svg)
<figcaption></figcaption>
</figure>
As a result, the `hdbscan_model` parameter in BERTopic now allows for a variety of clustering models. To do so, the class should have
the following attributes:
* `.fit(X)`
* A function that can be used to fit the model
* `.predict(X)`
* A predict function that transforms the input to cluster labels
* `.labels_`
* The labels after fitting the model
In other words, it should have the following structure:
```python
class ClusterModel:
def fit(self, X):
self.labels_ = None
return self
def predict(self, X):
return X
```
In this section, we will go through several examples of clustering algorithms and how they can be implemented.
## **HDBSCAN**
As a default, BERTopic uses HDBSCAN to perform its clustering. To use a HDBSCAN model with custom parameters,
we simply define it and pass it to BERTopic:
```python
from bertopic import BERTopic
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(hdbscan_model=hdbscan_model)
```
Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics you are using.
## **k-Means**
Although HDBSCAN works quite well in BERTopic and is typically advised, you might want to be using k-Means instead.
It allows you to select how many clusters you would like and forces every single point to be in a cluster. Therefore, no
outliers will be created. This also has disadvantages. When you force every single point into a cluster, it means
that the clusters are highly likely to contain noise, which can hurt the topic representations. As a small tip, using
the `vectorizer_model=CountVectorizer(stop_words="english")` helps quite a bit to then improve the topic representation.
Having said that, using k-Means is quite straightforward:
```python
from bertopic import BERTopic
from sklearn.cluster import KMeans
cluster_model = KMeans(n_clusters=50)
topic_model = BERTopic(hdbscan_model=cluster_model)
```
!!! note
As you might have noticed, the `cluster_model` is passed to `hdbscan_model` which might be a bit confusing considering
you are not passing an HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current
state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.
## **Agglomerative Clustering**
Like k-Means, there are a bunch more clustering algorithms in `sklearn` that you can be using. Some of these models do
not have a `.predict()` method but still can be used in BERTopic. However, using BERTopic's `.transform()` function
will then give errors.
Here, we will demonstrate Agglomerative Clustering:
```python
from bertopic import BERTopic
from sklearn.cluster import AgglomerativeClustering
cluster_model = AgglomerativeClustering(n_clusters=50)
topic_model = BERTopic(hdbscan_model=cluster_model)
```
## **cuML HDBSCAN**
Although the original HDBSCAN implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead,
we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up HDBSCAN through GPU acceleration:
```python
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
topic_model = BERTopic(hdbscan_model=hdbscan_model)
```
The great thing about using cuML's HDBSCAN implementation is that it supports many features of the original implementation. In other words,
`calculate_probabilities=True` also works!
!!! note
As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., `.transform`) using cuML's HDBSCAN.
However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., `.fit` and `.fit_transform`).
!!! note
If you want to install cuML together with BERTopic using Google Colab, you can run the following code:
```bash
!pip install bertopic
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64
```
@@ -0,0 +1,53 @@
<svg width="445" height="268" viewBox="0 0 445 268" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="132" y="230" width="118" height="38" fill="#64B5F6"/>
<rect x="224" y="220" width="20" height="8" fill="#64B5F6"/>
<rect x="196" y="220" width="20" height="8" fill="#64B5F6"/>
<rect x="168" y="220" width="20" height="8" fill="#64B5F6"/>
<rect x="140" y="220" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="158.256" y="257.939">SBERT</tspan></text>
<rect x="132" y="190" width="118" height="38" fill="#E57373"/>
<rect x="224" y="180" width="20" height="8" fill="#E57373"/>
<rect x="196" y="180" width="20" height="8" fill="#E57373"/>
<rect x="168" y="180" width="20" height="8" fill="#E57373"/>
<rect x="140" y="180" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="161.254" y="217.939">UMAP</tspan></text>
<rect y="150" width="118" height="38" fill="#4DB6AC"/>
<rect x="92" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="64" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="36" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="8" y="140" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="9.3418" y="177.939">HDBSCAN</tspan></text>
<rect x="132" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="224" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="196" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="168" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="140" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="138.346" y="114.161">CountVectorizer</tspan></text>
<rect x="132" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="224" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="196" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="168" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="140" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="146.938" y="77.9395">c-TF-IDF</tspan></text>
<rect x="132" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="224" width="20" height="8" fill="#3F51B5"/>
<rect x="196" width="20" height="8" fill="#3F51B5"/>
<rect x="168" width="20" height="8" fill="#3F51B5"/>
<rect x="140" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="161.065" y="25.0576">Optional&#10;</tspan><tspan x="150.271" y="42.0576">Fine-tuning</tspan></text>
<rect x="132" y="150" width="118" height="38" fill="#4DB6AC"/>
<rect x="224" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="196" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="168" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="140" y="140" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="148.246" y="177.939">k-Means</tspan></text>
<rect x="327" y="150" width="118" height="38" fill="#4DB6AC"/>
<rect x="419" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="391" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="363" y="140" width="20" height="8" fill="#4DB6AC"/>
<rect x="335" y="140" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="351.709" y="177.939">BIRCH</tspan></text>
<circle cx="266.5" cy="168.5" r="5.5" fill="black"/>
<circle cx="285.5" cy="168.5" r="5.5" fill="black"/>
<circle cx="307.5" cy="168.5" r="5.5" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.2 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 84 KiB

@@ -0,0 +1,82 @@
# c-TF-IDF
In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic level instead of a document level. This adjusted TF-IDF representation is called **c-TF-IDF** and takes into account what makes the documents in one cluster different from documents in another cluster:
<img class="w-6/12" src="../../algorithm/c-TF-IDF.svg">
<br>
Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.
<br><br>
Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation.
Since the topic representation is somewhat independent of the clustering step, we can change what the c-TF-IDF representation will look like. This can be in the form of parameter tuning, different weighting schemes, or using a diversity metric on top of it. This allows for some modularity concerning the weighting scheme:
<figure markdown>
![Image title](ctfidf.svg)
<figcaption></figcaption>
</figure>
This class-based TF-IDF representation is enabled by default in BERTopic. However, we can explicitly pass it to BERTopic through the `ctfidf_model` allowing for parameter tuning and the customization of the topic extraction technique:
```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer()
topic_model = BERTopic(ctfidf_model=ctfidf_model )
```
## **Parameters**
There are two parameters worth exploring in the `ClassTfidfTransformer`, namely `bm25_weighting` and `reduce_frequent_words`.
### bm25_weighting
The `bm25_weighting` is a boolean parameter that indicates whether a class-based BM-25 weighting measure is used instead of the default method as defined in the formula at the beginning of this page.
Instead of using the following weighting scheme:
<img class="w-6/12" src="idf.svg">
the class-based BM-25 weighting is used instead:
<img class="w-6/12" src="bm25.svg">
On smaller datasets, this variant can be more robust to stop words that appear in your data. It can be enabled as follows:
```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model )
```
### reduce_frequent_words
Some words appear quite often in every topic but are generally not considered stop words as found in the `CountVectorizer(stop_words="english")` list. To further reduce these frequent words, we can use `reduce_frequent_words` to take the square root of the term frequency after applying the weighting scheme.
Instead of the default term frequency:
<img class="w-8/12" src="tf.svg">
we take the square root of the term frequency after normalizing the frequency matrix:
<img class="w-8/12" src="tf_reduced.svg">
Although seemingly a small change, it can have quite a large effect on the number of stop words in the resulting topic representations. It can be enabled as follows:
```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model )
```
!!! tip
Both parameters can be used simultaneously: `ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)`
@@ -0,0 +1,53 @@
<svg width="445" height="248" viewBox="0 0 445 248" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="132" y="210" width="118" height="38" fill="#64B5F6"/>
<rect x="224" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="196" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="168" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="140" y="200" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="158.256" y="237.939">SBERT</tspan></text>
<rect x="132" y="170" width="118" height="38" fill="#E57373"/>
<rect x="224" y="160" width="20" height="8" fill="#E57373"/>
<rect x="196" y="160" width="20" height="8" fill="#E57373"/>
<rect x="168" y="160" width="20" height="8" fill="#E57373"/>
<rect x="140" y="160" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="161.254" y="197.939">UMAP</tspan></text>
<rect x="132" y="130" width="118" height="38" fill="#4DB6AC"/>
<rect x="224" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="196" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="168" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="140" y="120" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="141.342" y="157.939">HDBSCAN</tspan></text>
<rect y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="92" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="64" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="36" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="8" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="23.1357" y="66.1606">c-TF-IDF +&#10;</tspan><tspan x="40.4521" y="82.1606">BM25</tspan></text>
<rect x="132" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="224" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="196" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="168" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="140" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="138.346" y="113.161">CountVectorizer</tspan></text>
<rect x="132" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="224" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="196" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="168" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="140" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="146.938" y="77.9395">c-TF-IDF</tspan></text>
<rect x="132" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="224" width="20" height="8" fill="#3F51B5"/>
<rect x="196" width="20" height="8" fill="#3F51B5"/>
<rect x="168" width="20" height="8" fill="#3F51B5"/>
<rect x="140" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="161.065" y="26.0576">Optional&#10;</tspan><tspan x="150.271" y="43.0576">Fine-tuning</tspan></text>
<rect x="327" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="419" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="391" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="363" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="335" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="345.326" y="67.0576">c-TF-IDF + &#10;</tspan><tspan x="336.453" y="84.0576">Normalization</tspan></text>
<circle cx="266.5" cy="68.5" r="5.5" fill="black"/>
<circle cx="285.5" cy="68.5" r="5.5" fill="black"/>
<circle cx="307.5" cy="68.5" r="5.5" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.3 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 80 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 32 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 32 KiB

@@ -0,0 +1,18 @@
<svg width="534" height="57" viewBox="0 0 534 57" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="534" height="57" fill="white"/>
<rect x="0.5" y="14.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="10.9697">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="183" y="10.9697">UMAP</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="313" y="10.9697">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="468" y="10.9697">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="38.7637">Embeddings</tspan></text>
<rect x="142.5" y="14.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="156.094" y="33.7637">Dimensionality &#10;</tspan><tspan x="171.762" y="47.7637">reduction</tspan></text>
<path d="M126.707 33.7071C127.098 33.3166 127.098 32.6834 126.707 32.2929L120.343 25.9289C119.953 25.5384 119.319 25.5384 118.929 25.9289C118.538 26.3195 118.538 26.9526 118.929 27.3431L124.586 33L118.929 38.6569C118.538 39.0474 118.538 39.6805 118.929 40.0711C119.319 40.4616 119.953 40.4616 120.343 40.0711L126.707 33.7071ZM99 34H126V32H99V34Z" fill="black"/>
<rect x="295.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="317" y="38.7637">Clustering</tspan></text>
<path d="M285.707 33.7071C286.098 33.3166 286.098 32.6834 285.707 32.2929L279.343 25.9289C278.953 25.5384 278.319 25.5384 277.929 25.9289C277.538 26.3195 277.538 26.9526 277.929 27.3431L283.586 33L277.929 38.6569C277.538 39.0474 277.538 39.6805 277.929 40.0711C278.319 40.4616 278.953 40.4616 279.343 40.0711L285.707 33.7071ZM258 34H285V32H258V34Z" fill="black"/>
<rect x="442.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="472.404" y="30.7637">Topic &#10;</tspan><tspan x="450.215" y="44.7637">representation</tspan></text>
<path d="M426.707 33.7071C427.098 33.3166 427.098 32.6834 426.707 32.2929L420.343 25.9289C419.953 25.5384 419.319 25.5384 418.929 25.9289C418.538 26.3195 418.538 26.9526 418.929 27.3431L424.586 33L418.929 38.6569C418.538 39.0474 418.538 39.6805 418.929 40.0711C419.319 40.4616 419.953 40.4616 420.343 40.0711L426.707 33.7071ZM399 34H426V32H399V34Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 3.0 KiB

@@ -0,0 +1,138 @@
An important aspect of BERTopic is the dimensionality reduction of the input embeddings. As embeddings are often high in dimensionality, clustering becomes difficult due to the curse of dimensionality.
A solution is to reduce the dimensionality of the embeddings to a workable dimensional space (e.g., 5) for clustering algorithms to work with.
UMAP is used as a default in BERTopic since it can capture both the local and global high-dimensional space in lower dimensions.
However, there are other solutions out there, such as PCA, that users might be interested in trying out. Since BERTopic assumes some independence between steps, we can
use any other dimensionality reduction algorithm. The image below illustrates this modularity:
<figure markdown>
![Image title](dimensionality.svg)
<figcaption></figcaption>
</figure>
As a result, the `umap_model` parameter in BERTopic now allows for a variety of dimensionality reduction models. To do so, the class should have
the following attributes:
* `.fit(X)`
    * A function that can be used to fit the model
* `.transform(X)`
    * A transform function that transforms the input to a lower dimensional size
In other words, it should have the following structure:
```python
class DimensionalityReduction:
    def fit(self, X):
        return self

    def transform(self, X):
        return X
```
In this section, we will go through several examples of dimensionality reduction techniques and how they can be implemented.
## **UMAP**
As a default, BERTopic uses UMAP to perform its dimensionality reduction. To use a UMAP model with custom parameters,
we simply define it and pass it to BERTopic:
```python
from bertopic import BERTopic
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
topic_model = BERTopic(umap_model=umap_model)
```
Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics you are using.
## **PCA**
Although UMAP works quite well in BERTopic and is typically advised, you might want to be using PCA instead. It can be faster to train and perform
inference. To use PCA, we can simply import it from `sklearn` and pass it to the `umap_model` parameter:
```python
from bertopic import BERTopic
from sklearn.decomposition import PCA
dim_model = PCA(n_components=5)
topic_model = BERTopic(umap_model=dim_model)
```
As a small note, PCA and k-Means have worked quite well in my experiments and might be interesting to use instead of PCA and HDBSCAN.
!!! note
    As you might have noticed, the `dim_model` is passed to `umap_model` which might be a bit confusing considering
    you are not passing a UMAP model. For now, the name of the parameter is kept the same to adhere to the current
    state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.
## **Truncated SVD**
Like PCA, there are a bunch more dimensionality reduction techniques in `sklearn` that you can be using. Here, we will demonstrate Truncated SVD
but any model can be used as long as it has both a `.fit()` and `.transform()` method:
```python
from bertopic import BERTopic
from sklearn.decomposition import TruncatedSVD
dim_model = TruncatedSVD(n_components=5)
topic_model = BERTopic(umap_model=dim_model)
```
## **cuML UMAP**
Although the original UMAP implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead,
we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up UMAP through GPU acceleration:
```python
from bertopic import BERTopic
from cuml.manifold import UMAP
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
topic_model = BERTopic(umap_model=umap_model)
```
!!! note
    If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

    ```bash
    !pip install bertopic
    !pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
    !pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
    !pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
    !pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64
    ```
## **Skip dimensionality reduction**
Although BERTopic applies dimensionality reduction as a default in its pipeline, this is a step that you might want to skip. To do so, we create an "empty" model that simply returns the data that is passed to it:
```python
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
# Fit BERTopic without actually performing any dimensionality reduction
empty_dimensionality_model = BaseDimensionalityReduction()
topic_model = BERTopic(umap_model=empty_dimensionality_model)
```
In other words, we go from this pipeline:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/dim_reduction/default_pipeline.svg"
</div>
<br>
To the following pipeline:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/dim_reduction/no_dimensionality.svg"
</div>
<br>
@@ -0,0 +1,53 @@
<svg width="445" height="278" viewBox="0 0 445 278" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="132" y="240" width="118" height="38" fill="#64B5F6"/>
<rect x="224" y="230" width="20" height="8" fill="#64B5F6"/>
<rect x="196" y="230" width="20" height="8" fill="#64B5F6"/>
<rect x="168" y="230" width="20" height="8" fill="#64B5F6"/>
<rect x="140" y="230" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="158.256" y="267.939">SBERT</tspan></text>
<rect y="200" width="118" height="38" fill="#E57373"/>
<rect x="92" y="190" width="20" height="8" fill="#E57373"/>
<rect x="64" y="190" width="20" height="8" fill="#E57373"/>
<rect x="36" y="190" width="20" height="8" fill="#E57373"/>
<rect x="8" y="190" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="29.2539" y="227.939">UMAP</tspan></text>
<rect x="132" y="200" width="118" height="38" fill="#E57373"/>
<rect x="224" y="190" width="20" height="8" fill="#E57373"/>
<rect x="196" y="190" width="20" height="8" fill="#E57373"/>
<rect x="168" y="190" width="20" height="8" fill="#E57373"/>
<rect x="140" y="190" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="170.902" y="227.939">PCA</tspan></text>
<rect x="327" y="200" width="118" height="38" fill="#E57373"/>
<rect x="419" y="190" width="20" height="8" fill="#E57373"/>
<rect x="391" y="190" width="20" height="8" fill="#E57373"/>
<rect x="363" y="190" width="20" height="8" fill="#E57373"/>
<rect x="335" y="190" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="335.886" y="225.058">TruncatedSVD</tspan></text>
<circle cx="266.5" cy="218.5" r="5.5" fill="black"/>
<circle cx="285.5" cy="218.5" r="5.5" fill="black"/>
<circle cx="307.5" cy="218.5" r="5.5" fill="black"/>
<rect x="132" y="130" width="118" height="38" fill="#4DB6AC"/>
<rect x="224" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="196" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="168" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="140" y="120" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="141.342" y="157.939">HDBSCAN</tspan></text>
<rect x="132" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="224" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="196" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="168" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="140" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="138.346" y="114.161">CountVectorizer</tspan></text>
<rect x="132" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="224" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="196" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="168" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="140" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="146.938" y="76.9395">c-TF-IDF</tspan></text>
<rect x="132" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="224" width="20" height="8" fill="#3F51B5"/>
<rect x="196" width="20" height="8" fill="#3F51B5"/>
<rect x="168" width="20" height="8" fill="#3F51B5"/>
<rect x="140" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="161.065" y="27.0576">Optional&#10;</tspan><tspan x="150.271" y="44.0576">Fine-tuning</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 4.2 KiB

@@ -0,0 +1,14 @@
<svg width="374" height="57" viewBox="0 0 374 57" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="374" height="57" fill="white"/>
<rect x="0.5" y="14.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="10.9697">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="156" y="10.9697">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="308" y="10.9697">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="38.7637">Embeddings</tspan></text>
<rect x="135.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="157" y="38.7637">Clustering</tspan></text>
<path d="M125.707 33.7071C126.098 33.3166 126.098 32.6834 125.707 32.2929L119.343 25.9289C118.953 25.5384 118.319 25.5384 117.929 25.9289C117.538 26.3195 117.538 26.9526 117.929 27.3431L123.586 33L117.929 38.6569C117.538 39.0474 117.538 39.6805 117.929 40.0711C118.319 40.4616 118.953 40.4616 119.343 40.0711L125.707 33.7071ZM98 34H125V32H98V34Z" fill="black"/>
<rect x="282.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="312.404" y="30.7637">Topic &#10;</tspan><tspan x="290.215" y="44.7637">representation</tspan></text>
<path d="M266.707 33.7071C267.098 33.3166 267.098 32.6834 266.707 32.2929L260.343 25.9289C259.953 25.5384 259.319 25.5384 258.929 25.9289C258.538 26.3195 258.538 26.9526 258.929 27.3431L264.586 33L258.929 38.6569C258.538 39.0474 258.538 39.6805 258.929 40.0711C259.319 40.4616 259.953 40.4616 260.343 40.0711L266.707 33.7071ZM239 34H266V32H239V34Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 2.2 KiB

@@ -0,0 +1,123 @@
<svg width="1173" height="612" viewBox="0 0 1173 612" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="505" y="35.894">the</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="567" y="35.894">right</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="655" y="35.894">problem</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="798" y="35.894">is</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="835" y="35.894">difficult</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="35" letter-spacing="0em"><tspan x="372" y="35.894">Solving</tspan></text>
<rect x="362.5" y="0.5" width="604" height="47" stroke="black"/>
<line x1="499.5" y1="2.18557e-08" x2="499.5" y2="47" stroke="black"/>
<line x1="563.5" y1="1" x2="563.5" y2="48" stroke="black"/>
<line x1="649.5" y1="2.18557e-08" x2="649.5" y2="47" stroke="black"/>
<line x1="792.5" y1="2.18557e-08" x2="792.5" y2="47" stroke="black"/>
<line x1="829.5" y1="2.18557e-08" x2="829.5" y2="47" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="348" y="152.204">right </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="302" y="152.204">the </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="209" y="152.204">Solving </tspan></text>
<rect x="200.5" y="125.779" width="214" height="35" stroke="black" stroke-dasharray="2 2"/>
<line x1="298.307" y1="125" x2="298.307" y2="160.269" stroke="black" stroke-dasharray="2 2"/>
<line x1="343.333" y1="125.75" x2="343.333" y2="161.02" stroke="black" stroke-dasharray="2 2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="460" y="152.204">the </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="507" y="152.204">right</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="572.956" y="151.924">problem</tspan></text>
<rect x="452.5" y="125.779" width="214" height="35" stroke="black" stroke-dasharray="2 2"/>
<line x1="502.307" y1="125" x2="502.307" y2="160.269" stroke="black" stroke-dasharray="2 2"/>
<line x1="564.333" y1="125.75" x2="564.333" y2="161.02" stroke="black" stroke-dasharray="2 2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="711.405" y="152.204">right </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="776.956" y="151.924">problem</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="878.405" y="152.204">is</tspan></text>
<rect x="704.5" y="125.779" width="198" height="35" stroke="black" stroke-dasharray="2 2"/>
<line x1="769.409" y1="125" x2="769.409" y2="160.269" stroke="black" stroke-dasharray="2 2"/>
<line x1="872.769" y1="125.75" x2="872.769" y2="161.02" stroke="black" stroke-dasharray="2 2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="948" y="151.924">problem </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="1051" y="151.924">is</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="25" letter-spacing="0em"><tspan x="1079" y="151.924">difficult</tspan></text>
<rect x="940.5" y="125.779" width="230.609" height="35" stroke="black" stroke-dasharray="2 2"/>
<line x1="1047.86" y1="125" x2="1047.86" y2="160.269" stroke="black" stroke-dasharray="2 2"/>
<line x1="1075.37" y1="125.75" x2="1075.37" y2="161.02" stroke="black" stroke-dasharray="2 2"/>
<path d="M303.025 114.779C302.903 115.318 303.24 115.853 303.779 115.975L312.556 117.965C313.095 118.087 313.631 117.749 313.753 117.211C313.875 116.672 313.537 116.136 312.998 116.014L305.196 114.246L306.965 106.444C307.087 105.905 306.749 105.369 306.211 105.247C305.672 105.125 305.136 105.463 305.014 106.002L303.025 114.779ZM395.467 56.1541L303.467 114.154L304.533 115.846L396.533 57.8459L395.467 56.1541Z" fill="black"/>
<path d="M1064.33 115.944C1064.85 115.763 1065.13 115.193 1064.94 114.671L1061.98 106.172C1061.8 105.65 1061.23 105.375 1060.71 105.556C1060.19 105.738 1059.91 106.308 1060.1 106.83L1062.73 114.385L1055.17 117.016C1054.65 117.198 1054.37 117.768 1054.56 118.289C1054.74 118.811 1055.31 119.086 1055.83 118.905L1064.33 115.944ZM943.565 57.9003L1063.56 115.9L1064.44 114.1L944.435 56.0997L943.565 57.9003Z" fill="black"/>
<path d="M636.52 351.854C636.992 351.567 637.141 350.952 636.854 350.48L632.174 342.793C631.887 342.321 631.272 342.171 630.8 342.458C630.328 342.746 630.179 343.361 630.466 343.833L634.626 350.666L627.793 354.826C627.321 355.113 627.171 355.728 627.458 356.2C627.746 356.672 628.361 356.821 628.833 356.534L636.52 351.854ZM306.764 271.972L635.764 351.972L636.236 350.028L307.236 270.028L306.764 271.972Z" fill="black"/>
<path d="M660.103 349.995C660.652 349.938 661.052 349.446 660.995 348.897L660.069 339.945C660.012 339.396 659.52 338.996 658.971 339.053C658.422 339.11 658.022 339.601 658.079 340.151L658.902 348.108L650.945 348.931C650.396 348.988 649.996 349.48 650.053 350.029C650.11 350.578 650.601 350.978 651.151 350.921L660.103 349.995ZM563.369 271.776L659.369 349.776L660.631 348.224L564.631 270.224L563.369 271.776Z" fill="black"/>
<path d="M701.011 348.85C700.928 349.396 701.304 349.906 701.85 349.989L710.747 351.343C711.293 351.426 711.803 351.05 711.886 350.504C711.969 349.958 711.594 349.448 711.048 349.365L703.139 348.162L704.343 340.253C704.426 339.707 704.05 339.197 703.504 339.114C702.958 339.031 702.448 339.406 702.365 339.952L701.011 348.85ZM807.407 270.195L701.407 348.195L702.593 349.805L808.593 271.805L807.407 270.195Z" fill="black"/>
<path d="M725.149 350.475C724.859 350.945 725.005 351.561 725.475 351.851L733.133 356.578C733.603 356.868 734.219 356.722 734.51 356.252C734.8 355.782 734.654 355.166 734.184 354.876L727.376 350.674L731.578 343.867C731.868 343.397 731.722 342.78 731.252 342.49C730.782 342.2 730.166 342.346 729.876 342.816L725.149 350.475ZM1063.77 270.027L725.77 350.027L726.23 351.973L1064.23 271.973L1063.77 270.027Z" fill="black"/>
<path d="M561.293 115.707C561.683 116.098 562.317 116.098 562.707 115.707L569.071 109.343C569.462 108.953 569.462 108.319 569.071 107.929C568.681 107.538 568.047 107.538 567.657 107.929L562 113.586L556.343 107.929C555.953 107.538 555.319 107.538 554.929 107.929C554.538 108.319 554.538 108.953 554.929 109.343L561.293 115.707ZM561 57L561 115L563 115L563 57L561 57Z" fill="black"/>
<path d="M806.293 115.707C806.683 116.098 807.317 116.098 807.707 115.707L814.071 109.343C814.462 108.953 814.462 108.319 814.071 107.929C813.681 107.538 813.047 107.538 812.657 107.929L807 113.586L801.343 107.929C800.953 107.538 800.319 107.538 799.929 107.929C799.538 108.319 799.538 108.953 799.929 109.343L806.293 115.707ZM806 57L806 115L808 115L808 57L806 57Z" fill="black"/>
<rect x="240.5" y="238.5" width="127" height="20.8389" stroke="black"/>
<rect x="240.5" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="261.732" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="282.964" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="304.197" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="325.429" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="346.661" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<path d="M303.293 230.707C303.683 231.098 304.317 231.098 304.707 230.707L311.071 224.343C311.462 223.953 311.462 223.319 311.071 222.929C310.681 222.538 310.047 222.538 309.657 222.929L304 228.586L298.343 222.929C297.953 222.538 297.319 222.538 296.929 222.929C296.538 223.319 296.538 223.953 296.929 224.343L303.293 230.707ZM303 172L303 230L305 230L305 172L303 172Z" fill="black"/>
<rect x="498.5" y="238.5" width="127" height="20.8389" stroke="black"/>
<rect x="498.5" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="519.732" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="540.964" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="562.197" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="583.429" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="604.661" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="617.5" y="364.5" width="127" height="20.8389" stroke="black"/>
<rect x="617.5" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="638.732" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="659.964" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="681.197" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="702.429" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="723.661" y="364.5" width="20.8389" height="20.8389" stroke="black"/>
<path d="M561.293 230.707C561.683 231.098 562.317 231.098 562.707 230.707L569.071 224.343C569.462 223.953 569.462 223.319 569.071 222.929C568.681 222.538 568.047 222.538 567.657 222.929L562 228.586L556.343 222.929C555.953 222.538 555.319 222.538 554.929 222.929C554.538 223.319 554.538 223.953 554.929 224.343L561.293 230.707ZM561 172L561 230L563 230L563 172L561 172Z" fill="black"/>
<path d="M681.293 460.707C681.683 461.098 682.317 461.098 682.707 460.707L689.071 454.343C689.462 453.953 689.462 453.319 689.071 452.929C688.681 452.538 688.047 452.538 687.657 452.929L682 458.586L676.343 452.929C675.953 452.538 675.319 452.538 674.929 452.929C674.538 453.319 674.538 453.953 674.929 454.343L681.293 460.707ZM681 402L681 460L683 460L683 402L681 402Z" fill="black"/>
<rect x="743.5" y="238.5" width="127" height="20.8389" stroke="black"/>
<rect x="743.5" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="764.732" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="785.964" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="807.197" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="828.429" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="849.661" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<path d="M806.293 230.707C806.683 231.098 807.317 231.098 807.707 230.707L814.071 224.343C814.462 223.953 814.462 223.319 814.071 222.929C813.681 222.538 813.047 222.538 812.657 222.929L807 228.586L801.343 222.929C800.953 222.538 800.319 222.538 799.929 222.929C799.538 223.319 799.538 223.953 799.929 224.343L806.293 230.707ZM806 172L806 230L808 230L808 172L806 172Z" fill="black"/>
<rect x="1000.5" y="238.5" width="127" height="20.8389" stroke="black"/>
<rect x="1000.5" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="1021.73" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="1042.96" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="1064.2" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="1085.43" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<rect x="1106.66" y="238.5" width="20.8389" height="20.8389" stroke="black"/>
<path d="M1063.29 230.707C1063.68 231.098 1064.32 231.098 1064.71 230.707L1071.07 224.343C1071.46 223.953 1071.46 223.319 1071.07 222.929C1070.68 222.538 1070.05 222.538 1069.66 222.929L1064 228.586L1058.34 222.929C1057.95 222.538 1057.32 222.538 1056.93 222.929C1056.54 223.319 1056.54 223.953 1056.93 224.343L1063.29 230.707ZM1063 172L1063 230L1065 230L1065 172L1063 172Z" fill="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" letter-spacing="0em"><tspan x="30.5156" y="146.939">create token sets</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" letter-spacing="0em"><tspan x="4.08203" y="258.939">topic-token set similarity</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" letter-spacing="0em"><tspan x="356.066" y="379.939">document-topic distribution</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" letter-spacing="0em"><tspan x="179.961" y="548.939">multi-topic assignment </tspan><tspan x="239.375" y="572.939">on a token level</tspan></text>
<rect x="398" y="528" width="82" height="24" fill="#F1F1F1"/>
<rect x="478" y="528" width="82" height="24" fill="#0A539E"/>
<rect x="560" y="528" width="82" height="24" fill="#85BCDC"/>
<rect x="642" y="528" width="82" height="24" fill="#EAF2FB"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Quicksand" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="489" y="493.25">solving</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="16" font-weight="bold" letter-spacing="0em"><tspan x="412" y="544.852">topic 2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="16" font-weight="bold" letter-spacing="0em"><tspan x="412" y="520.852">topic 1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="16" font-weight="bold" letter-spacing="0em"><tspan x="412" y="568.852">topic 3</tspan></text>
<rect x="398" y="576" width="82" height="24" fill="#F1F1F1"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="16" font-weight="bold" letter-spacing="0em"><tspan x="412" y="592.852">topic 4</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="587" y="493.146">the</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="663" y="493.146">right</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="731" y="493.146">problem</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="840" y="493.146">is</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="18" font-weight="bold" letter-spacing="0em"><tspan x="897" y="493.146">difficult</tspan></text>
<line x1="398" y1="503" x2="970" y2="503" stroke="#BDBDBD" stroke-width="2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="504" y="544.764">0.75</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="590" y="544.764">0.32</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="671" y="544.764">0.16</tspan></text>
<rect x="560" y="576" width="82" height="24" fill="#D0E1F2"/>
<rect x="642" y="576" width="82" height="24" fill="#B7D4EA"/>
<rect x="724" y="576" width="82" height="24" fill="#0A539E"/>
<rect x="806" y="576" width="82" height="24" fill="#85BCDC"/>
<rect x="888" y="576" width="82" height="24" fill="#B7D4EA"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="590" y="592.764">0.21</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="671" y="592.764">0.29</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="750" y="592.764">0.81</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="837" y="592.764">0.47</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="917" y="592.764">0.26</tspan></text>
<rect x="806" y="504" width="82" height="24" fill="#EAF2FB"/>
<rect x="888" y="504" width="82" height="24" fill="#85BCDC"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="837" y="520.764">0.12</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="917" y="520.764">0.33</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 18 KiB

@@ -0,0 +1,107 @@
BERTopic approaches topic modeling as a clustering task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of using such a method is that each document is assigned to a single cluster and therefore also a single topic. In practice, documents may contain a mixture of topics. This can be accounted for by splitting up the documents into sentences and feeding those to BERTopic.
Another option is to use a cluster model that can perform soft clustering, like HDBSCAN. As BERTopic focuses on modularity, we may still want to model that mixture of topics even when we are using a hard-clustering model, like k-Means without the need to split up our documents. This is where `.approximate_distribution` comes in!
<br>
<div class="svg_image">
--8<-- "docs/getting_started/distribution/approximate_distribution.svg"
</div>
<br>
To perform this approximation, each document is split into tokens according to the provided tokenizer in the `CountVectorizer`. Then, a **sliding window** is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the document:
> Solving the right problem is difficult.
can be split up into `solving the right`, `the right problem`, `right problem is`, and `problem is difficult`. These are called token sets.
For each of these token sets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics.
Then, the similarities to the topics for each token set are summed to create a topic distribution for the entire document.
Although it is often said that documents can contain a mixture of topics, these are often modeled by assigning each word to a single topic.
With this approach, we take into account that there may be multiple topics for a single word.
We can make this multiple-topic word assignment a bit more accurate by then splitting these token sets up into individual tokens and assigning
the topic distributions for each token set to each individual token. That way, we can visualize the extent to which a certain word contributes
to a document's topic distribution.
## **Example**
To calculate our topic distributions, we first need to fit a basic topic model:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
topic_model = BERTopic().fit(docs)
```
After doing so, we can approximate the topic distributions for your documents:
```python
topic_distr, _ = topic_model.approximate_distribution(docs)
```
The resulting `topic_distr` is a *n* x *m* matrix where *n* are the documents and *m* the topics. We can then visualize the distribution
of topics in a document:
```python
topic_model.visualize_distribution(topic_distr[1])
```
<iframe src="distribution_viz.html" style="width:1000px; height: 620px; border: 0px;"></iframe>
Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first
calculate topic distributions on a token level and then visualize the results:
```python
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)
# Visualize the token-level distributions
df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])
df
```
<br><br>
<img src="distribution.png">
<br><br>
!!! tip
You can also approximate the topic distributions for unseen documents. It will not be as accurate as `.transform` but it is quite fast and can serve you well in a production setting.
!!! note
To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2`
## **Parameters**
There are a few parameters that are of interest which will be discussed below.
### **batch_size**
Creating token sets for each document can result in quite a large list of token sets. The similarity of these token sets with the topics can result in a large matrix that might not fit into memory anymore. To circumvent this, we can process batches of documents instead to minimize the memory overload. The value for `batch_size` indicates the number of documents that will be processed at once:
```python
topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=500)
```
### **window**
The number of tokens that are combined into token sets is defined by the `window` parameter. Seeing as we are performing a sliding window, we can change the size of the window. A larger window takes more tokens into account but setting it too large can result in considering too much information. Personally, I like to have this window between 4 and 8:
```python
topic_distr, _ = topic_model.approximate_distribution(docs, window=4)
```
### **stride**
The sliding window that is performed on a document shifts, as a default, 1 token to the right each time to create its token sets. As a result, especially with large windows, a single token gets judged several times. We can use the `stride` parameter to increase the number of tokens the window shifts to the right. By increasing
this value, we are judging each token less frequently which often results in a much faster calculation. Combining this parameter with `window` is preferred. For example, if we have a very large dataset, we can set `stride=4` and `window=8` to judge token sets that contain 8 tokens but that are shifted with 4 steps
each time. As a result, this increases the computational speed quite a bit:
```python
topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4)
```
### **use_embedding_model**
As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-word representation, this is quite fast. However, you might want to use the selected `embedding_model` instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower:
```python
topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)
```
Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

File diff suppressed because one or more lines are too long
@@ -0,0 +1,402 @@
# Embedding Models
BERTopic starts with transforming our input documents into numerical representations. Although there are many ways this can be achieved, we typically use sentence-transformers (`"all-MiniLM-L6-v2"`) as it is quite capable of capturing the semantic similarity between documents.
However, there is not one perfect
embedding model and you might want to be using something entirely different for your use case. Since BERTopic assumes some independence among steps, we can allow for this modularity:
<figure markdown>
![Image title](embeddings.svg)
<figcaption></figcaption>
</figure>
This modularity allows us not only to choose any embedding model to convert our documents into numerical representations, but also to use essentially any data to perform our clustering.
When new state-of-the-art pre-trained embedding models are released, BERTopic will be able to use them. As a result, BERTopic grows with any new models being released.
Out of the box, BERTopic supports several embedding techniques. In this section, we will go through several of them and how they can be implemented.
## **Sentence Transformers**
You can select any model from sentence-transformers [here](https://www.sbert.net/docs/pretrained_models.html)
and pass it through BERTopic with `embedding_model`:
```python
from bertopic import BERTopic
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
```
Or select a SentenceTransformer model with your parameters:
```python
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=sentence_model)
```
!!! tip "Tip 1!"
This embedding back-end was put here first for a reason, sentence-transformers works amazing out of the box! Playing around with different models can give you great results. Also, make sure to frequently visit [this](https://www.sbert.net/docs/pretrained_models.html) page as new models are often released.
!!! tip "Tip 2!"
New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.
Many of these models can be used with `SentenceTransformers` in BERTopic, like so:
```python
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
topic_model = BERTopic(embedding_model=embedding_model)
```
## **Model2Vec**
To use a blazingly fast [Model2Vec](https://github.com/MinishLab/model2vec) model, you first need to install model2vec:
```
pip install model2vec
```
Then, you can load in any of their models and pass it to BERTopic like so:
```python
from model2vec import StaticModel
embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
topic_model = BERTopic(embedding_model=embedding_model)
```
### **Distillation**
These models are extremely versatile and can be distilled from an existing embedding model (like those compatible with `sentence-transformers`).
This distillation process doesn't require a vocabulary (as it uses the tokenizer's vocabulary) but can benefit from having one. Fortunately, this allows you to
use the vocabulary from your input documents to distill a model yourself.
Doing so requires you to install some additional dependencies of model2vec like so:
```
pip install model2vec[distill]
```
To then distill common embedding models, you need to import the `Model2VecBackend` from BERTopic:
```python
from bertopic.backend import Model2VecBackend
# Choose a model to distill (a non-Model2Vec model)
embedding_model = Model2VecBackend(
"sentence-transformers/all-MiniLM-L6-v2",
distill=True
)
topic_model = BERTopic(embedding_model=embedding_model)
```
You can also choose a custom vectorizer for creating the vocabulary and define custom arguments for the distillation process:
```python
from bertopic.backend import Model2VecBackend
from sklearn.feature_extraction.text import CountVectorizer
# Choose a model to distill (a non-Model2Vec model)
embedding_model = Model2VecBackend(
"sentence-transformers/all-MiniLM-L6-v2",
distill=True,
distill_kwargs={"pca_dims": 256, "apply_zipf": True, "use_subword": True},
distill_vectorizer=CountVectorizer(ngram_range=(1, 3))
)
topic_model = BERTopic(embedding_model=embedding_model)
```
!!! tip "Tip!"
You can save the resulting model with `topic_model.embedding_model.embedding_model.save_pretrained("m2v_model")`.
## **🤗 Hugging Face Transformers**
To use a Hugging Face transformers model, load in a pipeline and point
to any model found on their model hub (https://huggingface.co/models):
```python
from transformers.pipelines import pipeline
embedding_model = pipeline("feature-extraction", model="distilbert-base-cased")
topic_model = BERTopic(embedding_model=embedding_model)
```
!!! tip "Tip!"
These transformers also work quite well using `sentence-transformers` which has great optimization tricks that make using it a bit faster.
## **Langchain**
[Langchain](https://python.langchain.com/docs/introduction) allows you to use different embedding models supported by various cloud providers. On top of that, it supports various integrations to open source models. To get started:
```python
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from bertopic.backend import LangChainBackend
hf_embedding = HuggingFaceInstructEmbeddings()
langchain_embedder = LangChainBackend(hf_embedding)
```
To see what providers are being supported by Langchain, you can check the list [here](https://python.langchain.com/docs/integrations/providers/).
For more information, you can have a look at [Langchain's Embedding Models](https://python.langchain.com/docs/integrations/text_embedding/).
## **Flair**
[Flair](https://github.com/flairNLP/flair) allows you to choose almost any embedding model that
is publicly available. Flair can be used as follows:
```python
from flair.embeddings import TransformerDocumentEmbeddings
roberta = TransformerDocumentEmbeddings('roberta-base')
topic_model = BERTopic(embedding_model=roberta)
```
You can select any 🤗 transformers model [here](https://huggingface.co/models).
Moreover, you can also use Flair to use word embeddings and pool them to create document embeddings.
Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily
pass it to BERTopic to use those word embeddings as document embeddings:
```python
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
glove_embedding = WordEmbeddings('crawl')
document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
topic_model = BERTopic(embedding_model=document_glove_embeddings)
```
## **Spacy**
[Spacy](https://github.com/explosion/spaCy) is an amazing framework for processing text. There are
many models available across many languages for modeling text.
To use Spacy's non-transformer models in BERTopic:
```python
import spacy
nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner',
'attribute_ruler', 'lemmatizer'])
topic_model = BERTopic(embedding_model=nlp)
```
Using spacy-transformer models:
```python
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner',
'attribute_ruler', 'lemmatizer'])
topic_model = BERTopic(embedding_model=nlp)
```
If you run into memory issues with spacy-transformer models, try:
```python
import spacy
from thinc.api import set_gpu_allocator, require_gpu
nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner',
'attribute_ruler', 'lemmatizer'])
set_gpu_allocator("pytorch")
require_gpu(0)
topic_model = BERTopic(embedding_model=nlp)
```
## **Universal Sentence Encoder (USE)**
The Universal Sentence Encoder encodes text into high-dimensional vectors that are used here
for embedding the documents. The model is trained and optimized for greater-than-word length text,
such as sentences, phrases, or short paragraphs.
Using USE in BERTopic is rather straightforward:
```python
import tensorflow_hub
embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
topic_model = BERTopic(embedding_model=embedding_model)
```
## **Gensim**
BERTopic supports the `gensim.downloader` module, which allows it to download any word embedding model supported by Gensim.
Typically, these are Glove, Word2Vec, or FastText embeddings:
```python
import gensim.downloader as api
ft = api.load('fasttext-wiki-news-subwords-300')
topic_model = BERTopic(embedding_model=ft)
```
!!! tip "Tip!"
Gensim is primarily used for Word Embedding models. This works typically best for short documents since the word embeddings are pooled.
## **Scikit-Learn Embeddings**
Scikit-Learn is a framework for more than just machine learning.
It offers many preprocessing tools, some of which can be used to create representations
for text. Many of these tools are relatively lightweight and do not require a GPU.
While the representations may be less expressive than many BERT models, the fact that
it runs much faster can make it a relevant candidate to consider.
If you have a scikit-learn compatible pipeline that you'd like to use to embed
text then you can also pass this to BERTopic.
```python
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
pipe = make_pipeline(
TfidfVectorizer(),
TruncatedSVD(100)
)
topic_model = BERTopic(embedding_model=pipe)
```
!!! Warning
One caveat to be aware of is that scikit-learn's base `Pipeline` class does not
support the `.partial_fit()`-API. If you have a pipeline that theoretically should
be able to support online learning then you might want to explore
the [scikit-partial](https://github.com/koaning/scikit-partial) project.
Moreover, since this backend does not generate representations on a word level,
it does not support the `bertopic.representation` models.
## **OpenAI**
To use OpenAI's external API, we need to define our key and explicitly call `bertopic.backend.OpenAIBackend`
to be used in our topic model:
```python
import openai
from bertopic.backend import OpenAIBackend
client = openai.OpenAI(api_key="sk-...")
embedding_model = OpenAIBackend(client, "text-embedding-ada-002")
topic_model = BERTopic(embedding_model=embedding_model)
```
## **Cohere**
To use Cohere's external API, we need to define our key and explicitly call `bertopic.backend.CohereBackend`
to be used in our topic model:
```python
import cohere
from bertopic.backend import CohereBackend
client = cohere.Client("MY_API_KEY")
embedding_model = CohereBackend(client)
topic_model = BERTopic(embedding_model=embedding_model)
```
## **FastEmbed**
[FastEmbed](https://qdrant.tech/documentation/fastembed/) is a lightweight Python library for embedding generation
and it supports popular embedding models.
You can easily use it as in the example below:
```python
from bertopic.backend import FastEmbedBackend
embedding_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
topic_model = BERTopic(embedding_model=embedding_model)
```
!!! tip "Tip!"
Before you start, check the supported FastEmbed text embedding models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/).
## **Multimodal**
To create embeddings for both text and images in the same vector space, we can use the `MultiModalBackend`.
This model uses a clip-vit based model that is capable of embedding text, images, or both:
```python
from bertopic.backend import MultiModalBackend
model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
# Embed documents only
doc_embeddings = model.embed_documents(docs)
# Embedding images only
image_embeddings = model.embed_images(images)
# Embed both images and documents, then average them
doc_image_embeddings = model.embed(docs, images)
```
## **Custom Backend**
If your backend or model cannot be found in the ones currently available, you can use the `bertopic.backend.BaseEmbedder` class to
create your backend. Below, you will find an example of creating a SentenceTransformer backend for BERTopic:
```python
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer
class CustomEmbedder(BaseEmbedder):
    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model
    def embed(self, documents, verbose=False):
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings
# Create custom backend
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
custom_embedder = CustomEmbedder(embedding_model=embedding_model)
# Pass custom backend to bertopic
topic_model = BERTopic(embedding_model=custom_embedder)
```
## **Custom Embeddings**
The base models in BERTopic are BERT-based models that work well with document similarity tasks. Your documents,
however, might be too specific for a general pre-trained model to be used. Fortunately, you can use the embedding
model in BERTopic to create document features.
You only need to prepare the document embeddings yourself and pass them through `fit_transform` of BERTopic:
```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)
# Train our topic model using our pre-trained sentence-transformers embeddings
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs, embeddings)
```
As you can see above, we used a SentenceTransformer model to create the embedding. You could also have used
`🤗 transformers`, `Doc2Vec`, or any other embedding method.
### **TF-IDF**
As mentioned above, any embedding technique can be used. However, when running UMAP, the typical distance metric is
`cosine` which does not work quite well for a TF-IDF matrix. Instead, BERTopic will recognize that a sparse matrix
is passed and use `hellinger` instead which works quite well for the similarity between probability distributions.
We simply create a TF-IDF matrix and use them as embeddings in our `fit_transform` method:
```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TF-IDF sparse matrix
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
vectorizer = TfidfVectorizer(min_df=5)
embeddings = vectorizer.fit_transform(docs)
# Train our topic model using TF-IDF vectors
topic_model = BERTopic(stop_words="english")
topics, probs = topic_model.fit_transform(docs, embeddings)
```
Here, you will probably notice that creating the embeddings is quite fast whereas `fit_transform` is quite slow.
This is to be expected as reducing the dimensionality of a large sparse matrix takes some time. The inverse is true when using
transformer embeddings: creating the embeddings is slow whereas `fit_transform` is quite fast.
@@ -0,0 +1,105 @@
<svg width="458" height="285" viewBox="0 0 458 285" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect y="244" width="118" height="38" fill="#64B5F6"/>
<rect x="92" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="64" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="36" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="8" y="234" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="27.9355" y="271.939">SpaCy</tspan></text>
<rect x="132" y="244" width="118" height="38" fill="#64B5F6"/>
<rect x="224" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="196" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="168" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="140" y="234" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="158.256" y="271.939">SBERT</tspan></text>
<rect x="327" y="244" width="118" height="38" fill="#64B5F6"/>
<rect x="419" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="391" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="363" y="234" width="20" height="8" fill="#64B5F6"/>
<rect x="335" y="234" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="358.623" y="266.764">Transformers</tspan></text>
<circle cx="266.5" cy="262.5" r="5.5" fill="black"/>
<circle cx="285.5" cy="262.5" r="5.5" fill="black"/>
<circle cx="307.5" cy="262.5" r="5.5" fill="black"/>
<path d="M345.944 270.706C342.02 270.706 337.787 268.245 337.787 262.845C337.787 257.445 342.02 254.984 345.944 254.984C348.123 254.984 350.134 255.702 351.625 257.009C353.242 258.444 354.1 260.469 354.1 262.845C354.1 265.222 353.242 267.233 351.625 268.667C350.134 269.975 348.109 270.706 345.944 270.706Z" fill="url(#paint0_radial_22_799)"/>
<path d="M345.944 270.706C342.02 270.706 337.787 268.245 337.787 262.845C337.787 257.445 342.02 254.984 345.944 254.984C348.123 254.984 350.134 255.702 351.625 257.009C353.242 258.444 354.1 260.469 354.1 262.845C354.1 265.222 353.242 267.233 351.625 268.667C350.134 269.975 348.109 270.706 345.944 270.706Z" fill="url(#paint1_linear_22_799)"/>
<path d="M352.678 258.172C353.428 259.382 353.819 260.822 353.819 262.423C353.819 264.8 352.961 266.811 351.344 268.245C349.853 269.553 347.828 270.284 345.663 270.284C343.123 270.284 340.454 269.251 338.887 267.058C340.4 269.542 343.245 270.706 345.944 270.706C348.109 270.706 350.134 269.975 351.625 268.667C353.242 267.233 354.1 265.222 354.1 262.845C354.1 261.054 353.612 259.462 352.678 258.172Z" fill="#EB8F00"/>
<path opacity="0.8" d="M341.177 265.306C342.443 265.306 343.469 264.28 343.469 263.014C343.469 261.748 342.443 260.722 341.177 260.722C339.911 260.722 338.884 261.748 338.884 263.014C338.884 264.28 339.911 265.306 341.177 265.306Z" fill="url(#paint2_radial_22_799)"/>
<path opacity="0.8" d="M350.627 263.534C351.892 263.534 352.919 262.508 352.919 261.242C352.919 259.976 351.892 258.95 350.627 258.95C349.361 258.95 348.334 259.976 348.334 261.242C348.334 262.508 349.361 263.534 350.627 263.534Z" fill="url(#paint3_radial_22_799)"/>
<path d="M350.627 260.947C349.544 261.945 348.109 262.733 346.45 263.113C344.791 263.492 343.159 263.422 341.753 262.986C341.5 262.902 341.317 263.239 341.542 263.408C342.963 264.463 344.861 264.913 346.759 264.463C348.658 264.027 350.177 262.803 350.992 261.228C351.133 260.989 350.823 260.764 350.627 260.947Z" fill="#422B0D"/>
<path d="M343.82 260.638L343.792 260.609C343.778 260.595 343.75 260.581 343.722 260.553C343.694 260.539 343.666 260.511 343.637 260.497C343.609 260.469 343.567 260.441 343.525 260.413C343.483 260.384 343.441 260.356 343.398 260.342C343.356 260.314 343.314 260.3 343.272 260.3C343.23 260.286 343.202 260.286 343.187 260.286H343.159H343.145H343.173L343.103 260.3C343.089 260.3 343.103 260.3 343.103 260.3H343.117C343.131 260.3 343.117 260.3 343.117 260.3H343.103C343.089 260.314 343.061 260.328 343.033 260.342C343.005 260.37 342.962 260.398 342.934 260.427C342.906 260.455 342.878 260.497 342.85 260.539C342.794 260.623 342.752 260.694 342.723 260.75C342.695 260.806 342.667 260.834 342.667 260.834L342.639 260.891C342.498 261.13 342.189 261.228 341.936 261.102C341.767 261.017 341.669 260.863 341.655 260.694V260.539C341.669 260.441 341.683 260.3 341.739 260.131C341.795 259.963 341.894 259.752 342.077 259.555C342.175 259.456 342.287 259.344 342.428 259.273C342.456 259.245 342.498 259.231 342.541 259.217C342.583 259.203 342.611 259.175 342.667 259.161L342.737 259.133C342.766 259.119 342.794 259.119 342.808 259.119L342.878 259.105L342.92 259.091H342.977L343.047 259.077H343.173C343.258 259.077 343.342 259.077 343.427 259.091C343.595 259.119 343.736 259.175 343.862 259.231C344.116 259.358 344.27 259.513 344.397 259.639C344.453 259.709 344.509 259.766 344.537 259.822C344.58 259.878 344.608 259.934 344.636 259.977C344.664 260.019 344.664 260.047 344.678 260.061C344.678 260.075 344.692 260.089 344.692 260.089C344.791 260.342 344.65 260.623 344.369 260.722C344.172 260.792 343.961 260.75 343.82 260.638Z" fill="#422B0D"/>
<path d="M348.644 259.695L348.616 259.667C348.602 259.653 348.573 259.639 348.545 259.611C348.517 259.597 348.489 259.569 348.461 259.555C348.433 259.527 348.391 259.498 348.348 259.47C348.306 259.442 348.264 259.414 348.222 259.4C348.18 259.372 348.138 259.358 348.095 259.358C348.067 259.344 348.039 259.344 348.025 259.344H347.983H348.011L347.941 259.358C347.927 259.358 347.941 259.358 347.941 259.358H347.955C347.969 259.358 347.955 259.358 347.955 259.358H347.941C347.927 259.372 347.898 259.386 347.856 259.4C347.828 259.428 347.8 259.456 347.758 259.484C347.73 259.513 347.702 259.555 347.673 259.597C347.617 259.681 347.575 259.752 347.547 259.808C347.519 259.864 347.491 259.892 347.491 259.892L347.463 259.934C347.322 260.173 347.013 260.272 346.759 260.145C346.591 260.061 346.492 259.906 346.478 259.738C346.478 259.738 346.478 259.681 346.492 259.583C346.506 259.484 346.52 259.344 346.577 259.175C346.633 259.006 346.731 258.795 346.914 258.598C347.013 258.5 347.125 258.388 347.266 258.317C347.294 258.303 347.336 258.275 347.378 258.261C347.42 258.247 347.448 258.219 347.505 258.205L347.589 258.177C347.617 258.163 347.645 258.163 347.659 258.163L347.73 258.148L347.772 258.134H347.828L347.898 258.12H348.025C348.109 258.12 348.194 258.12 348.278 258.134C348.447 258.163 348.588 258.219 348.714 258.275C348.967 258.402 349.122 258.556 349.248 258.683C349.305 258.753 349.361 258.809 349.389 258.866C349.431 258.922 349.459 258.978 349.488 259.02C349.516 259.063 349.516 259.091 349.53 259.105C349.544 259.119 349.544 259.133 349.544 259.133C349.642 259.386 349.502 259.667 349.22 259.766C348.995 259.864 348.784 259.822 348.644 259.695Z" fill="#422B0D"/>
<path d="M343.029 265.62C342.993 265.27 343.009 265.07 342.74 265.007C342.439 264.936 342.171 265.152 342.089 265.46C341.874 266.284 342.24 266.859 342.24 266.859C341.719 266.756 341.285 266.25 341.285 266.25C341.013 265.931 340.631 265.138 340.367 264.811C340.206 264.613 339.943 264.451 339.682 264.588C339.055 264.917 339.749 266.036 339.983 266.334C340.248 266.673 339.292 265.589 339.152 265.353C338.89 264.914 338.569 264.797 338.301 265.026C338.032 265.254 338.247 265.824 338.37 266.04C339.119 267.355 339.773 267.677 339.773 267.677C339.773 267.677 339.077 267.343 338.484 266.624C337.924 265.946 337.392 266.662 337.765 267.264C337.844 267.392 337.994 267.722 338.478 268.14C338.849 268.46 338.607 268.268 338.478 268.14C337.983 267.652 337.456 268.172 337.615 268.598C337.799 269.089 338.399 269.543 338.806 269.844C339.596 270.429 340.582 270.898 341.58 270.865C342.691 270.83 343.511 270.367 343.757 268.993C343.871 268.355 343.515 267.376 343.41 267.126C343.065 266.316 343.048 265.818 343.029 265.62Z" fill="url(#paint4_radial_22_799)"/>
<path d="M341.913 269.038C341.764 268.829 341.344 267.856 342.129 266.86L342.356 267.004C341.468 268.1 342.089 269.037 342.198 269.249C342.198 269.249 342.055 269.24 341.913 269.038Z" fill="#EB8F00"/>
<path d="M343.497 267.036C343.148 266.36 343.123 266.008 343.099 265.623C343.072 265.344 343.053 265 342.717 264.921C342.573 264.887 342.373 264.907 342.196 265.102C341.753 265.595 342.106 266.739 342.106 266.739C341.469 266.441 341.362 266.178 340.911 265.499C340.714 265.201 340.4 264.65 340.114 264.489C339.908 264.374 339.604 264.46 339.453 264.665C339.111 265.123 339.72 266.053 339.72 266.053C339.72 266.053 339.384 265.564 339.237 265.308C338.989 264.873 338.533 264.633 338.197 264.935C337.648 265.429 338.714 266.756 338.714 266.756C338.714 266.756 338.224 265.978 337.734 266.308C337.48 266.479 337.38 266.926 337.644 267.357C337.761 267.552 338.058 267.898 338.058 267.898C338.058 267.898 337.938 267.822 337.738 267.887C337.516 267.96 337.326 268.235 337.488 268.666C337.675 269.169 338.229 269.605 338.734 269.969C339.883 270.799 341.226 270.85 341.226 270.85C341.226 270.85 339.754 270.567 338.811 269.709C338.516 269.441 337.346 268.624 337.764 268.199C337.862 268.098 338.042 268.047 338.308 268.258C339.243 269.003 339.562 268.83 339.562 268.83C339.596 268.701 339.503 268.646 339.225 268.482C339.032 268.368 338.768 268.212 338.572 268.044C338.093 267.629 337.551 266.908 337.848 266.587C338.007 266.416 338.191 266.452 338.37 266.637C339.613 267.905 339.905 267.726 339.905 267.726C339.905 267.726 339.943 267.589 339.551 267.244C339.25 266.981 338.88 266.656 338.423 265.86C338.302 265.649 338.24 265.274 338.418 265.126C338.657 264.924 338.877 265.057 339.076 265.354C339.318 265.715 339.661 266.16 340.002 266.488C340.416 266.883 340.638 267.004 340.652 266.99C340.751 266.883 340.327 266.468 340.139 266.209C339.475 265.294 339.485 264.993 339.623 264.755C339.71 264.605 339.95 264.537 340.24 264.901C340.864 265.621 341.179 266.793 342.355 267.001C342.355 267.001 342.4 266.93 342.295 266.549C342.168 266.151 342.147 265.813 342.212 265.516C342.283 265.188 342.546 265.095 342.691 265.13C342.885 265.177 342.863 265.503 342.863 265.503C342.863 265.503 342.874 265.623 342.88 
265.678C342.902 265.914 342.949 266.406 343.275 267.165C343.406 267.469 344.103 268.718 343.158 270.28C343.158 270.28 343.559 270.383 344.019 270.49C344.019 270.49 344.2 269.85 344.116 268.856C344.013 267.605 343.732 267.492 343.497 267.036Z" fill="#EB8F00"/>
<path d="M337.077 268.438C337.082 268.399 337.087 268.365 337.094 268.335C337.084 268.368 337.079 268.401 337.077 268.438Z" fill="url(#paint5_radial_22_799)"/>
<path d="M348.843 265.62C348.879 265.27 348.865 265.077 349.132 265.007C349.419 264.932 349.696 265.153 349.783 265.46C349.984 266.18 349.599 266.859 349.599 266.859C350.119 266.756 350.56 266.257 350.56 266.257C350.832 265.938 351.242 265.138 351.505 264.811C351.666 264.613 351.856 264.485 352.131 264.586C352.701 264.797 352.106 265.95 351.872 266.25C351.607 266.589 351.775 266.534 352.133 266.142C352.491 265.746 352.681 265.523 352.822 265.288C353.083 264.849 353.397 264.883 353.571 265.026C353.844 265.251 353.625 265.824 353.502 266.04C352.753 267.355 353.388 266.624 353.388 266.624C353.948 265.946 354.48 266.662 354.107 267.264C354.028 267.392 353.878 267.722 353.394 268.14C353.023 268.46 353.265 268.268 353.394 268.14C353.889 267.652 354.416 268.172 354.257 268.598C354.073 269.089 353.473 269.543 353.066 269.844C352.276 270.429 351.29 270.898 350.292 270.865C349.181 270.83 348.361 270.367 348.115 268.993C348.001 268.355 348.357 267.376 348.462 267.126C348.807 266.316 348.824 265.818 348.843 265.62Z" fill="url(#paint6_radial_22_799)"/>
<path d="M349.959 269.038C350.108 268.829 350.528 267.856 349.743 266.86L349.516 267.004C350.404 268.1 349.783 269.037 349.675 269.249C349.675 269.249 349.817 269.24 349.959 269.038Z" fill="#EB8F00"/>
<path d="M348.375 267.036C348.724 266.36 348.749 266.008 348.773 265.623C348.8 265.344 348.82 265 349.156 264.921C349.299 264.887 349.499 264.907 349.676 265.102C350.119 265.595 349.766 266.739 349.766 266.739C350.403 266.441 350.51 266.178 350.961 265.499C351.158 265.201 351.472 264.65 351.759 264.489C351.964 264.374 352.268 264.46 352.42 264.665C352.761 265.123 352.152 266.053 352.152 266.053C352.152 266.053 352.488 265.564 352.635 265.308C352.884 264.873 353.339 264.633 353.675 264.935C354.224 265.429 353.158 266.756 353.158 266.756C353.158 266.756 353.649 265.978 354.138 266.308C354.393 266.479 354.492 266.926 354.228 267.357C354.111 267.552 353.815 267.898 353.815 267.898C353.815 267.898 353.934 267.822 354.134 267.887C354.356 267.96 354.546 268.235 354.384 268.666C354.197 269.169 353.643 269.605 353.138 269.969C351.989 270.799 350.646 270.85 350.646 270.85C350.646 270.85 352.119 270.567 353.061 269.709C353.356 269.441 354.526 268.624 354.108 268.199C354.01 268.098 353.83 268.047 353.564 268.258C352.629 269.003 352.31 268.83 352.31 268.83C352.276 268.701 352.369 268.646 352.647 268.482C352.84 268.368 353.104 268.212 353.3 268.044C353.779 267.629 354.321 266.908 354.024 266.587C353.865 266.416 353.681 266.452 353.502 266.637C352.259 267.905 351.967 267.726 351.967 267.726C351.967 267.726 351.929 267.589 352.321 267.244C352.622 266.981 352.992 266.656 353.449 265.86C353.57 265.649 353.632 265.274 353.455 265.126C353.216 264.924 352.995 265.057 352.796 265.354C352.555 265.715 352.211 266.16 351.87 266.488C351.456 266.883 351.234 267.004 351.22 266.99C351.122 266.883 351.545 266.468 351.733 266.209C352.397 265.294 352.387 264.993 352.249 264.755C352.162 264.605 351.922 264.537 351.632 264.901C351.005 265.624 350.69 266.795 349.516 267.004C349.516 267.004 349.471 266.933 349.576 266.552C349.703 266.154 349.724 265.815 349.659 265.519C349.587 265.191 349.324 265.098 349.18 265.133C348.986 265.18 349.008 265.506 349.008 265.506C349.008 265.506 348.997 265.625 348.991 
265.68C348.969 265.917 348.922 266.409 348.596 267.168C348.465 267.472 347.766 268.733 348.713 270.283C348.713 270.283 348.312 270.386 347.852 270.493C347.852 270.493 347.671 269.853 347.755 268.858C347.859 267.605 348.14 267.492 348.375 267.036Z" fill="#EB8F00"/>
<path d="M354.795 268.438C354.79 268.399 354.785 268.365 354.778 268.335C354.788 268.368 354.793 268.401 354.795 268.438Z" fill="url(#paint7_radial_22_799)"/>
<rect x="132" y="170" width="118" height="38" fill="#E57373"/>
<rect x="224" y="160" width="20" height="8" fill="#E57373"/>
<rect x="196" y="160" width="20" height="8" fill="#E57373"/>
<rect x="168" y="160" width="20" height="8" fill="#E57373"/>
<rect x="140" y="160" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="161.254" y="197.939">UMAP</tspan></text>
<rect x="132" y="130" width="118" height="38" fill="#4DB6AC"/>
<rect x="224" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="196" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="168" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="140" y="120" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="141.342" y="157.939">HDBSCAN</tspan></text>
<rect x="132" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="224" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="196" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="168" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="140" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="138.346" y="114.161">CountVectorizer</tspan></text>
<rect x="132" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="224" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="196" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="168" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="140" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="146.938" y="77.9395">c-TF-IDF</tspan></text>
<rect x="132" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="224" width="20" height="8" fill="#3F51B5"/>
<rect x="196" width="20" height="8" fill="#3F51B5"/>
<rect x="168" width="20" height="8" fill="#3F51B5"/>
<rect x="140" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="161.065" y="25.0576">Optional&#10;</tspan><tspan x="150.271" y="42.0576">Fine-tuning</tspan></text>
<defs>
<radialGradient id="paint0_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(345.944 262.845) scale(8.01)">
<stop offset="0.5" stop-color="#FDE030"/>
<stop offset="0.919" stop-color="#F7C02B"/>
<stop offset="1" stop-color="#F4A223"/>
</radialGradient>
<linearGradient id="paint1_linear_22_799" x1="345.944" y1="270.706" x2="345.944" y2="254.984" gradientUnits="userSpaceOnUse">
<stop offset="0.158" stop-color="#F4A223"/>
<stop offset="0.333" stop-color="#F7C02B"/>
<stop offset="0.807" stop-color="#FDE030" stop-opacity="0"/>
</linearGradient>
<radialGradient id="paint2_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(341.172 263.015) rotate(-10.6121) scale(2.49432 2.36965)">
<stop stop-color="#ED7770"/>
<stop offset="0.9" stop-color="#ED7770" stop-opacity="0"/>
</radialGradient>
<radialGradient id="paint3_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(350.628 261.247) rotate(-10.6121) scale(2.49404 2.36938)">
<stop stop-color="#ED7770"/>
<stop offset="0.9" stop-color="#ED7770" stop-opacity="0"/>
</radialGradient>
<radialGradient id="paint4_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(341.552 267.229) scale(5.41589)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint5_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(341.104 267.2) scale(5.47088 5.47084)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint6_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(350.321 267.236) rotate(180) scale(5.40773)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
<radialGradient id="paint7_radial_22_799" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse" gradientTransform="translate(350.768 267.2) rotate(180) scale(5.471 5.47084)">
<stop offset="0.33" stop-color="#FFF176"/>
<stop offset="1" stop-color="#FFC400"/>
</radialGradient>
</defs>
</svg>

After

Width:  |  Height:  |  Size: 19 KiB

@@ -0,0 +1,48 @@
Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach by setting several seed topics to which the model will converge. These techniques allow the user to set a predefined number of topic representations that are sure to be in the documents. For example, take an IT business that has a ticket system for the software their clients use. Those tickets may typically contain information about a specific bug regarding login issues that the IT business is aware of.
To model that bug, we can create a seed topic representation containing the words `bug`, `login`, `password`,
and `username`. By defining those words, a Guided Topic Modeling approach will try to converge at least one topic to those words.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/guided/guided.svg"
</div>
<br>
Guided BERTopic has two main steps:
First, we create an embedding for each seeded topic by joining its words and passing the result through the document embedder. These embeddings are compared with the existing document embeddings through cosine similarity, and each document is assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label.
If it is most similar to the average document embedding, it will get the -1 label.
These labels are then passed through UMAP to create a semi-supervised approach that should nudge
the topic creation to the seeded topics.
Second, we take all words in `seed_topic_list` and assign them a multiplier larger than 1.
Those multipliers will be used to increase the IDF values of the words across all topics, thereby increasing
the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is currently a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier.
### **Example**
To demonstrate Guided BERTopic, we use the 20 Newsgroups dataset as our example. We have frequently used this
dataset in BERTopic examples and we sometimes see a topic generated about health with words such as `drug` and `cancer`
being important. However, due to the stochastic nature of UMAP, this topic is not always found.
In order to guide BERTopic to that topic, we create a seed topic list that we pass through our model. However,
there may be several other topics that we know should be in the documents. Let's also initialize those:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"]
seed_topic_list = [["drug", "cancer", "drugs", "doctor"],
["windows", "drive", "dos", "file"],
["space", "launch", "orbit", "lunar"]]
topic_model = BERTopic(seed_topic_list=seed_topic_list)
topics, probs = topic_model.fit_transform(docs)
```
As you can see above, the `seed_topic_list` contains a list of topic representations. By defining the above topics,
BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those
topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will
not be modeled. Thus, seed topics need to be accurate for the model to converge towards them.
@@ -0,0 +1,152 @@
<svg width="718" height="527" viewBox="0 0 718 527" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="44.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="58.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="72.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="86.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="100.5" y="115.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="33.3086" y="108.176">&#34;drug cancer drugs doctor&#34;&#10;</tspan></text>
<rect x="227.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="241.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="255.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="269.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="283.5" y="115.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="221.75" y="108.176">&#34;windows drive dos file&#34;&#10;</tspan><tspan x="264" y="118.176">&#10;</tspan></text>
<rect x="413.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="427.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="441.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="455.5" y="115.5" width="14" height="14" stroke="black"/>
<rect x="469.5" y="115.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="404.438" y="108.176">&#34;space launch orbit lunar&#34;&#10;</tspan><tspan x="450" y="118.176">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="522" y="104.764">Concatenate and embed the </tspan><tspan x="522" y="118.764">keywords/keyphrases using the </tspan><tspan x="522" y="132.764">embedding model.</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="408" y="200.764">For each document, generate labels by finding </tspan><tspan x="408" y="214.764">which seeded topic fits best based on cosine </tspan><tspan x="408" y="228.764">similarity between embeddings.</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="366" y="313.764">Average the embedding of each document </tspan><tspan x="366" y="327.764">with the selected seeded topic. </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="550" y="24.7637">Define seed topics through </tspan><tspan x="550" y="38.7637">keywords or keyphrases. </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="11" y="29.9697">&#34;drug&#34;, &#34;cancer&#34;, &#34;drugs&#34;, &#34;doctor&#34;&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" font-weight="bold" letter-spacing="0em"><tspan x="53" y="9.96973">Seed topic 1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" font-weight="bold" letter-spacing="0em"><tspan x="233" y="9.96973">Seed topic 2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" font-weight="bold" letter-spacing="0em"><tspan x="413" y="9.96973">Seed topic 3</tspan></text>
<rect x="0.5" y="14.5" width="170" height="24" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="198" y="29.9697">&#34;windows&#34;, &#34;drive&#34;, &#34;dos&#34;, &#34;file&#34;&#10;</tspan></text>
<rect x="185.5" y="14.5" width="170" height="24" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="378" y="29.9697">&#34;space&#34;, &#34;launch&#34;, &#34;orbit&#34;, &#34;lunar&#34;&#10;</tspan></text>
<rect x="365.5" y="14.5" width="170" height="24" stroke="black"/>
<path d="M445.293 86.7071C445.683 87.0976 446.317 87.0976 446.707 86.7071L453.071 80.3431C453.462 79.9526 453.462 79.3195 453.071 78.9289C452.681 78.5384 452.047 78.5384 451.657 78.9289L446 84.5858L440.343 78.9289C439.953 78.5384 439.319 78.5384 438.929 78.9289C438.538 79.3195 438.538 79.9526 438.929 80.3431L445.293 86.7071ZM445 51L445 86L447 86L447 51L445 51Z" fill="black"/>
<path d="M263.293 86.7071C263.683 87.0976 264.317 87.0976 264.707 86.7071L271.071 80.3431C271.462 79.9526 271.462 79.3195 271.071 78.9289C270.681 78.5384 270.047 78.5384 269.657 78.9289L264 84.5858L258.343 78.9289C257.953 78.5384 257.319 78.5384 256.929 78.9289C256.538 79.3195 256.538 79.9526 256.929 80.3431L263.293 86.7071ZM263 51L263 86L265 86L265 51L263 51Z" fill="black"/>
<path d="M263.293 166.707C263.683 167.098 264.317 167.098 264.707 166.707L271.071 160.343C271.462 159.953 271.462 159.319 271.071 158.929C270.681 158.538 270.047 158.538 269.657 158.929L264 164.586L258.343 158.929C257.953 158.538 257.319 158.538 256.929 158.929C256.538 159.319 256.538 159.953 256.929 160.343L263.293 166.707ZM263 141L263 166L265 166L265 141L263 141Z" fill="black"/>
<path d="M263.293 272.707C263.683 273.098 264.317 273.098 264.707 272.707L271.071 266.343C271.462 265.953 271.462 265.319 271.071 264.929C270.681 264.538 270.047 264.538 269.657 264.929L264 270.586L258.343 264.929C257.953 264.538 257.319 264.538 256.929 264.929C256.538 265.319 256.538 265.953 256.929 266.343L263.293 272.707ZM263 247L263 272L265 272L265 247L263 247Z" fill="black"/>
<path d="M80.2929 86.7071C80.6834 87.0976 81.3166 87.0976 81.7071 86.7071L88.0711 80.3431C88.4616 79.9526 88.4616 79.3195 88.0711 78.9289C87.6805 78.5384 87.0474 78.5384 86.6569 78.9289L81 84.5858L75.3431 78.9289C74.9526 78.5384 74.3195 78.5384 73.9289 78.9289C73.5384 79.3195 73.5384 79.9526 73.9289 80.3431L80.2929 86.7071ZM80 51L80 86L82 86L82 51L80 51Z" fill="black"/>
<path d="M117.951 166.912C118.495 166.818 118.86 166.301 118.766 165.757L117.231 156.888C117.137 156.344 116.619 155.979 116.075 156.074C115.531 156.168 115.166 156.685 115.26 157.229L116.624 165.112L108.742 166.477C108.197 166.571 107.833 167.088 107.927 167.632C108.021 168.177 108.538 168.541 109.083 168.447L117.951 166.912ZM80.4238 140.817L117.204 166.744L118.356 165.11L81.5762 139.183L80.4238 140.817Z" fill="black"/>
<path d="M410.049 166.912C409.505 166.818 409.14 166.301 409.234 165.757L410.769 156.888C410.863 156.344 411.381 155.979 411.925 156.074C412.469 156.168 412.834 156.685 412.74 157.229L411.376 165.112L419.258 166.477C419.803 166.571 420.167 167.088 420.073 167.632C419.979 168.177 419.462 168.541 418.917 168.447L410.049 166.912ZM447.576 140.817L410.796 166.744L409.644 165.11L446.424 139.183L447.576 140.817Z" fill="black"/>
<path d="M183.832 200H161.78C161.349 200 161 200.349 161 200.78V229.22C161 229.651 161.349 230 161.78 230H183.832C184.262 230 184.612 229.651 184.612 229.22V200.78C184.612 200.349 184.262 200 183.832 200Z" fill="white" stroke="black"/>
<path d="M164.049 204.447H181.418" stroke="black" stroke-linecap="round"/>
<path d="M168.405 207.496H181.417" stroke="black" stroke-linecap="round"/>
<path d="M164.049 207.496H166.753" stroke="black" stroke-linecap="round"/>
<path d="M164.049 210.4H181.418" stroke="black" stroke-linecap="round"/>
<path d="M167.534 213.303H181.417" stroke="black" stroke-linecap="round"/>
<path d="M164.049 213.303H165.882" stroke="black" stroke-linecap="round"/>
<path d="M166.227 216.207H171.69" stroke="black" stroke-linecap="round"/>
<path d="M164.049 216.207H164.721" stroke="black" stroke-linecap="round"/>
<path d="M173.196 216.207H181.417" stroke="black" stroke-linecap="round"/>
<path d="M164.049 219.111H181.418" stroke="black" stroke-linecap="round"/>
<path d="M164.049 222.015H181.418" stroke="black" stroke-linecap="round"/>
<path d="M164.049 224.918H181.418" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="162.098" y="186.176">Seed &#10;</tspan><tspan x="160.082" y="196.176">topic 3&#10;</tspan><tspan x="172" y="206.176">&#10;</tspan></text>
<path d="M242.832 200H220.78C220.349 200 220 200.349 220 200.78V229.22C220 229.651 220.349 230 220.78 230H242.832C243.262 230 243.612 229.651 243.612 229.22V200.78C243.612 200.349 243.262 200 242.832 200Z" fill="white" stroke="black"/>
<path d="M223.049 204.447H240.418" stroke="black" stroke-linecap="round"/>
<path d="M227.405 207.496H240.417" stroke="black" stroke-linecap="round"/>
<path d="M223.049 207.496H225.753" stroke="black" stroke-linecap="round"/>
<path d="M223.049 210.4H240.418" stroke="black" stroke-linecap="round"/>
<path d="M226.534 213.303H240.417" stroke="black" stroke-linecap="round"/>
<path d="M223.049 213.303H224.882" stroke="black" stroke-linecap="round"/>
<path d="M225.227 216.207H230.69" stroke="black" stroke-linecap="round"/>
<path d="M223.049 216.207H223.721" stroke="black" stroke-linecap="round"/>
<path d="M232.196 216.207H240.417" stroke="black" stroke-linecap="round"/>
<path d="M223.049 219.111H240.418" stroke="black" stroke-linecap="round"/>
<path d="M223.049 222.015H240.418" stroke="black" stroke-linecap="round"/>
<path d="M223.049 224.918H240.418" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="222.098" y="186.176">Seed &#10;</tspan><tspan x="220.082" y="196.176">topic 2&#10;</tspan><tspan x="232" y="206.176">&#10;</tspan></text>
<path d="M305.832 200H283.78C283.349 200 283 200.349 283 200.78V229.22C283 229.651 283.349 230 283.78 230H305.832C306.262 230 306.612 229.651 306.612 229.22V200.78C306.612 200.349 306.262 200 305.832 200Z" fill="white" stroke="black"/>
<path d="M286.049 204.447H303.418" stroke="black" stroke-linecap="round"/>
<path d="M290.405 207.496H303.417" stroke="black" stroke-linecap="round"/>
<path d="M286.049 207.496H288.753" stroke="black" stroke-linecap="round"/>
<path d="M286.049 210.4H303.418" stroke="black" stroke-linecap="round"/>
<path d="M289.534 213.303H303.417" stroke="black" stroke-linecap="round"/>
<path d="M286.049 213.303H287.882" stroke="black" stroke-linecap="round"/>
<path d="M288.227 216.207H293.69" stroke="black" stroke-linecap="round"/>
<path d="M286.049 216.207H286.721" stroke="black" stroke-linecap="round"/>
<path d="M295.196 216.207H303.417" stroke="black" stroke-linecap="round"/>
<path d="M286.049 219.111H303.418" stroke="black" stroke-linecap="round"/>
<path d="M286.049 222.015H303.418" stroke="black" stroke-linecap="round"/>
<path d="M286.049 224.918H303.418" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="272.965" y="186.176">No seed topic </tspan><tspan x="274.754" y="196.176">match found</tspan></text>
<path d="M374.832 200H352.78C352.349 200 352 200.349 352 200.78V229.22C352 229.651 352.349 230 352.78 230H374.832C375.262 230 375.612 229.651 375.612 229.22V200.78C375.612 200.349 375.262 200 374.832 200Z" fill="white" stroke="black"/>
<path d="M355.049 204.447H372.418" stroke="black" stroke-linecap="round"/>
<path d="M359.405 207.496H372.417" stroke="black" stroke-linecap="round"/>
<path d="M355.049 207.496H357.753" stroke="black" stroke-linecap="round"/>
<path d="M355.049 210.4H372.418" stroke="black" stroke-linecap="round"/>
<path d="M358.534 213.303H372.417" stroke="black" stroke-linecap="round"/>
<path d="M355.049 213.303H356.882" stroke="black" stroke-linecap="round"/>
<path d="M357.227 216.207H362.69" stroke="black" stroke-linecap="round"/>
<path d="M355.049 216.207H355.721" stroke="black" stroke-linecap="round"/>
<path d="M364.196 216.207H372.417" stroke="black" stroke-linecap="round"/>
<path d="M355.049 219.111H372.418" stroke="black" stroke-linecap="round"/>
<path d="M355.049 222.015H372.418" stroke="black" stroke-linecap="round"/>
<path d="M355.049 224.918H372.418" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="341.18" y="196.176">Seed topic 2&#10;</tspan><tspan x="363" y="206.176">&#10;</tspan></text>
<circle cx="320" cy="214" r="1" fill="black"/>
<circle cx="330" cy="214" r="1" fill="black"/>
<circle cx="340" cy="214" r="1" fill="black"/>
<rect x="268.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="282.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="296.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="310.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="324.5" y="307.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="284.809" y="293.176">seed topic &#10;</tspan><tspan x="284.441" y="303.176">embedding</tspan></text>
<rect x="180.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="194.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="208.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="222.5" y="307.5" width="14" height="14" stroke="black"/>
<rect x="236.5" y="307.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="197.254" y="292.176">document &#10;</tspan><tspan x="196.441" y="302.176">embedding</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="256" y="317.176">+</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="256" y="344.176">2</tspan></text>
<line x1="179" y1="328.5" x2="343" y2="328.5" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="379" y="462.764">Multiply the IDF values of the seeded </tspan><tspan x="379" y="476.764">keywords across all topics with 1.2.</tspan></text>
<path d="M263.293 392.707C263.683 393.098 264.317 393.098 264.707 392.707L271.071 386.343C271.462 385.953 271.462 385.319 271.071 384.929C270.681 384.538 270.047 384.538 269.657 384.929L264 390.586L258.343 384.929C257.953 384.538 257.319 384.538 256.929 384.929C256.538 385.319 256.538 385.953 256.929 386.343L263.293 392.707ZM263 367L263 392L265 392L265 367L263 367Z" fill="black"/>
<rect x="162.045" y="400" width="202.259" height="17.7695" fill="white"/>
<line x1="162.045" y1="417.815" x2="364.305" y2="417.815" stroke="#BDBDBD" stroke-width="2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="9" font-weight="bold" letter-spacing="0em"><tspan x="178.247" y="414.299">Word</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="9" font-weight="bold" letter-spacing="0em"><tspan x="274.412" y="414.299">IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="9" font-weight="bold" letter-spacing="0em"><tspan x="215.292" y="414.848">Multiplier</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="9" font-weight="bold" letter-spacing="0em"><tspan x="301.927" y="413.693">Adjusted IDF</tspan></text>
<rect x="162.045" y="418.815" width="202.259" height="17.7695" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="179.829" y="431.172">drug</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="217.264" y="431.694">1.2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="431.694">.55</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="327.539" y="431.694">.66</tspan></text>
<rect x="162.045" y="436.062" width="202.259" height="17.7695" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="217.264" y="448.419">1.2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="173.649" y="448.419">doctor</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="448.941">.78</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="327.539" y="448.941">.94</tspan></text>
<rect x="162.045" y="453.831" width="202.259" height="17.7695" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="185.438" y="466.188">cat</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="224.053" y="466.711">1&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="466.188">.22</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="327.539" y="466.188">.22</tspan></text>
<rect x="162.045" y="471.601" width="202.259" height="17.7695" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="224.053" y="483.958">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="182.812" y="484.176">dog</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="483.958">.11</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="327.539" y="483.958">.11</tspan></text>
<rect x="162.045" y="489.37" width="202.259" height="17.7695" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="175.915" y="501.727">space</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="217.264" y="502.25">1.2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="502.772">.35</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="327.539" y="502.772">.42</tspan></text>
<rect x="162.045" y="507.14" width="202.259" height="17.7695" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="217.264" y="519.497">1.2&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="172.907" y="519.497">launch</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="275.276" y="519.497">.89</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="323.172" y="519.497">1.07</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 20 KiB

@@ -0,0 +1,166 @@
<svg width="480" height="410" viewBox="0 0 480 410" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="248" y="49.7637">Create a distance matrix by calculating the </tspan><tspan x="248" y="63.7637">cosine similarity between c-TF-IDF </tspan><tspan x="248" y="77.7637">representations of each topic. </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="250" y="200.764">Apply a linkage function of choice on the </tspan><tspan x="250" y="214.764">distance matrix to model the hierarchical </tspan><tspan x="250" y="228.764">structure of topics. </tspan></text>
<path d="M86.8315 310H64.7801C64.3493 310 64 310.349 64 310.78V339.22C64 339.651 64.3493 340 64.7801 340H86.8315C87.2624 340 87.6117 339.651 87.6117 339.22V310.78C87.6117 310.349 87.2624 310 86.8315 310Z" fill="white" stroke="black"/>
<path d="M84.8315 308H62.7801C62.3493 308 62 308.349 62 308.78V337.22C62 337.651 62.3493 338 62.7801 338H84.8315C85.2624 338 85.6117 337.651 85.6117 337.22V308.78C85.6117 308.349 85.2624 308 84.8315 308Z" fill="white" stroke="black"/>
<path d="M82.8315 306H60.7801C60.3493 306 60 306.349 60 306.78V335.22C60 335.651 60.3493 336 60.7801 336H82.8315C83.2624 336 83.6117 335.651 83.6117 335.22V306.78C83.6117 306.349 83.2624 306 82.8315 306Z" fill="white" stroke="black"/>
<path d="M63.049 310.447H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M67.4046 313.496H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 313.496H65.7534" stroke="black" stroke-linecap="round"/>
<path d="M63.049 316.399H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M66.5335 319.303H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 319.303H64.8823" stroke="black" stroke-linecap="round"/>
<path d="M65.2268 322.207H70.6898" stroke="black" stroke-linecap="round"/>
<path d="M63.049 322.207H63.7208" stroke="black" stroke-linecap="round"/>
<path d="M72.1959 322.207H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 325.111H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 328.015H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 330.918H80.4175" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="57.9023" y="301.176">Topic 26</tspan></text>
<path d="M36.8315 310H14.7801C14.3493 310 14 310.349 14 310.78V339.22C14 339.651 14.3493 340 14.7801 340H36.8315C37.2624 340 37.6117 339.651 37.6117 339.22V310.78C37.6117 310.349 37.2624 310 36.8315 310Z" fill="white" stroke="black"/>
<path d="M34.8315 308H12.7801C12.3493 308 12 308.349 12 308.78V337.22C12 337.651 12.3493 338 12.7801 338H34.8315C35.2624 338 35.6117 337.651 35.6117 337.22V308.78C35.6117 308.349 35.2624 308 34.8315 308Z" fill="white" stroke="black"/>
<path d="M32.8315 306H10.7801C10.3493 306 10 306.349 10 306.78V335.22C10 335.651 10.3493 336 10.7801 336H32.8315C33.2624 336 33.6117 335.651 33.6117 335.22V306.78C33.6117 306.349 33.2624 306 32.8315 306Z" fill="white" stroke="black"/>
<path d="M13.049 310.447H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M17.4046 313.496H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 313.496H15.7534" stroke="black" stroke-linecap="round"/>
<path d="M13.049 316.399H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M16.5335 319.303H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 319.303H14.8823" stroke="black" stroke-linecap="round"/>
<path d="M15.2268 322.207H20.6898" stroke="black" stroke-linecap="round"/>
<path d="M13.049 322.207H13.7208" stroke="black" stroke-linecap="round"/>
<path d="M22.1959 322.207H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 325.111H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 328.015H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 330.918H30.4175" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="10.0859" y="301.176">Topic 1</tspan></text>
<path d="M36.8315 368H14.7801C14.3493 368 14 368.349 14 368.78V397.22C14 397.651 14.3493 398 14.7801 398H36.8315C37.2624 398 37.6117 397.651 37.6117 397.22V368.78C37.6117 368.349 37.2624 368 36.8315 368Z" fill="white" stroke="black"/>
<path d="M34.8315 366H12.7801C12.3493 366 12 366.349 12 366.78V395.22C12 395.651 12.3493 396 12.7801 396H34.8315C35.2624 396 35.6117 395.651 35.6117 395.22V366.78C35.6117 366.349 35.2624 366 34.8315 366Z" fill="white" stroke="black"/>
<path d="M32.8315 364H10.7801C10.3493 364 10 364.349 10 364.78V393.22C10 393.651 10.3493 394 10.7801 394H32.8315C33.2624 394 33.6117 393.651 33.6117 393.22V364.78C33.6117 364.349 33.2624 364 32.8315 364Z" fill="white" stroke="black"/>
<path d="M13.049 368.447H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M17.4046 371.496H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 371.496H15.7534" stroke="black" stroke-linecap="round"/>
<path d="M13.049 374.399H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M16.5335 377.303H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 377.303H14.8823" stroke="black" stroke-linecap="round"/>
<path d="M15.2268 380.207H20.6898" stroke="black" stroke-linecap="round"/>
<path d="M13.049 380.207H13.7208" stroke="black" stroke-linecap="round"/>
<path d="M22.1959 380.207H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 383.111H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 386.015H30.4175" stroke="black" stroke-linecap="round"/>
<path d="M13.049 388.918H30.4175" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="7.90234" y="359.176">Topic 38</tspan></text>
<path d="M86.8315 368H64.7801C64.3493 368 64 368.349 64 368.78V397.22C64 397.651 64.3493 398 64.7801 398H86.8315C87.2624 398 87.6117 397.651 87.6117 397.22V368.78C87.6117 368.349 87.2624 368 86.8315 368Z" fill="white" stroke="black"/>
<path d="M84.8315 366H62.7801C62.3493 366 62 366.349 62 366.78V395.22C62 395.651 62.3493 396 62.7801 396H84.8315C85.2624 396 85.6117 395.651 85.6117 395.22V366.78C85.6117 366.349 85.2624 366 84.8315 366Z" fill="white" stroke="black"/>
<path d="M82.8315 364H60.7801C60.3493 364 60 364.349 60 364.78V393.22C60 393.651 60.3493 394 60.7801 394H82.8315C83.2624 394 83.6117 393.651 83.6117 393.22V364.78C83.6117 364.349 83.2624 364 82.8315 364Z" fill="white" stroke="black"/>
<path d="M63.049 368.447H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M67.4046 371.496H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 371.496H65.7534" stroke="black" stroke-linecap="round"/>
<path d="M63.049 374.399H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M66.5335 377.303H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 377.303H64.8823" stroke="black" stroke-linecap="round"/>
<path d="M65.2268 380.207H70.6898" stroke="black" stroke-linecap="round"/>
<path d="M63.049 380.207H63.7208" stroke="black" stroke-linecap="round"/>
<path d="M72.1959 380.207H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 383.111H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 386.015H80.4175" stroke="black" stroke-linecap="round"/>
<path d="M63.049 388.918H80.4175" stroke="black" stroke-linecap="round"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="57.9023" y="359.176">Topic 42</tspan></text>
<rect x="134.5" y="342.5" width="14" height="14" stroke="black"/>
<rect x="148.5" y="342.5" width="14" height="14" stroke="black"/>
<rect x="162.5" y="342.5" width="14" height="14" stroke="black"/>
<rect x="176.5" y="342.5" width="14" height="14" stroke="black"/>
<rect x="190.5" y="342.5" width="14" height="14" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="133.379" y="336.176">re-calculate c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="234" y="336.764">Update the c-TF-IDF representation </tspan><tspan x="234" y="350.764">based on the collection of documents </tspan><tspan x="234" y="364.764">across the merged topics. &#10;</tspan></text>
<line x1="87" y1="289.5" x2="107" y2="289.5" stroke="black"/>
<line x1="87" y1="409.5" x2="107" y2="409.5" stroke="black"/>
<line x1="108" y1="349.5" x2="128" y2="349.5" stroke="black"/>
<line x1="107.5" y1="289" x2="107.5" y2="410" stroke="black"/>
<rect x="127.5" y="38.5" width="19" height="19" stroke="black"/>
<rect x="146.5" y="38.5" width="19" height="19" stroke="black"/>
<rect x="165.5" y="38.5" width="19" height="19" stroke="black"/>
<rect x="184.5" y="38.5" width="19" height="19" stroke="black"/>
<rect x="203.5" y="38.5" width="19" height="19" stroke="black"/>
<rect x="127.5" y="57.5" width="19" height="19" stroke="black"/>
<rect x="146.5" y="57.5" width="19" height="19" stroke="black"/>
<rect x="165.5" y="57.5" width="19" height="19" stroke="black"/>
<rect x="184.5" y="57.5" width="19" height="19" stroke="black"/>
<rect x="203.5" y="57.5" width="19" height="19" stroke="black"/>
<rect x="127.5" y="76.5" width="19" height="19" stroke="black"/>
<rect x="146.5" y="76.5" width="19" height="19" stroke="black"/>
<rect x="165.5" y="76.5" width="19" height="19" stroke="black"/>
<rect x="184.5" y="76.5" width="19" height="19" stroke="black"/>
<rect x="203.5" y="76.5" width="19" height="19" stroke="black"/>
<rect x="127.5" y="95.5" width="19" height="19" stroke="black"/>
<rect x="146.5" y="95.5" width="19" height="19" stroke="black"/>
<rect x="165.5" y="95.5" width="19" height="19" stroke="black"/>
<rect x="184.5" y="95.5" width="19" height="19" stroke="black"/>
<rect x="203.5" y="95.5" width="19" height="19" stroke="black"/>
<rect x="127.5" y="19.5" width="19" height="19" stroke="black"/>
<rect x="146.5" y="19.5" width="19" height="19" stroke="black"/>
<rect x="165.5" y="19.5" width="19" height="19" stroke="black"/>
<rect x="184.5" y="19.5" width="19" height="19" stroke="black"/>
<rect x="203.5" y="19.5" width="19" height="19" stroke="black"/>
<rect x="127.5" y="0.5" width="19" height="19" fill="black" stroke="black"/>
<rect x="146.5" y="0.5" width="19" height="19" fill="black" stroke="black"/>
<rect x="165.5" y="0.5" width="19" height="19" fill="black" stroke="black"/>
<rect x="184.5" y="0.5" width="19" height="19" fill="black" stroke="black"/>
<rect x="203.5" y="0.5" width="19" height="19" fill="black" stroke="black"/>
<rect x="127.5" y="19.5" width="19" height="19" transform="rotate(90 127.5 19.5)" fill="black" stroke="black"/>
<rect x="127.5" y="38.5" width="19" height="19" transform="rotate(90 127.5 38.5)" fill="black" stroke="black"/>
<rect x="127.5" y="57.5" width="19" height="19" transform="rotate(90 127.5 57.5)" fill="black" stroke="black"/>
<rect x="127.5" y="76.5" width="19" height="19" transform="rotate(90 127.5 76.5)" fill="black" stroke="black"/>
<rect x="127.5" y="95.5" width="19" height="19" transform="rotate(90 127.5 95.5)" fill="black" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="103.52" y="13.1758">Topic</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="134.816" y="33.1758">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="149.922" y="33.1758">.12</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="131.922" y="51.1758">.12</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="168.922" y="33.1758">.53</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="131.922" y="71.1758">.53</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="168.922" y="51.1758">.74</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="149.922" y="71.1758">.74</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="206.922" y="51.1758">.89</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="149.922" y="109.176">.89</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="206.922" y="32.1758">.24</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="130.922" y="109.176">.24</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="206.922" y="69.1758">.01</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="169.922" y="109.176">.01</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="153.816" y="51.1758">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="172.816" y="70.1758">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="191.816" y="89.1758">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="210.816" y="108.176">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="209.367" y="89.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="171.367" y="89.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="151.367" y="89.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="132.367" y="89.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="190.367" y="69.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="190.367" y="49.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="190.367" y="30.1758">...</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="190.367" y="107.176">...</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="115.816" y="33.1758">1</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="115.816" y="52.1758">2</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="115.816" y="70.1758">3</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="134.816" y="13.1758">1</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="152.816" y="13.1758">2</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="172.816" y="13.1758">3</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="209.77" y="13.1758">n</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="190.367" y="13.1758">...</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="116.789" y="83.1758">.</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="116.789" y="87.1758">.</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="116.789" y="91.1758">.</tspan></text>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="8" letter-spacing="0em"><tspan x="115.77" y="108.176">n</tspan></text>
<path d="M173.293 165.707C173.683 166.098 174.317 166.098 174.707 165.707L181.071 159.343C181.462 158.953 181.462 158.319 181.071 157.929C180.681 157.538 180.047 157.538 179.657 157.929L174 163.586L168.343 157.929C167.953 157.538 167.319 157.538 166.929 157.929C166.538 158.319 166.538 158.953 166.929 159.343L173.293 165.707ZM173 127L173 165L175 165L175 127L173 127Z" fill="black"/>
<path d="M173.293 305.707C173.683 306.098 174.317 306.098 174.707 305.707L181.071 299.343C181.462 298.953 181.462 298.319 181.071 297.929C180.681 297.538 180.047 297.538 179.657 297.929L174 303.586L168.343 297.929C167.953 297.538 167.319 297.538 166.929 297.929C166.538 298.319 166.538 298.953 166.929 299.343L173.293 305.707ZM173 267L173 305L175 305L175 267L173 267Z" fill="black"/>
<circle cx="132" cy="185" r="3.5" stroke="black"/>
<circle cx="192" cy="205" r="3.5" stroke="black"/>
<circle cx="202" cy="185" r="3.5" stroke="black"/>
<circle cx="202" cy="235" r="3.5" stroke="black"/>
<circle cx="181" cy="233" r="3.5" stroke="black"/>
<circle cx="137" cy="213" r="3.5" stroke="black"/>
<circle cx="158" cy="227" r="3.5" stroke="black"/>
<circle cx="122" cy="233" r="3.5" stroke="black"/>
<circle cx="162" cy="194" r="3.5" stroke="black"/>
<line x1="135.14" y1="185.52" x2="159.14" y2="192.52" stroke="black"/>
<line x1="184.066" y1="233.504" x2="199.066" y2="235.504" stroke="black"/>
<line x1="192.553" y1="201.776" x2="199.553" y2="187.776" stroke="black"/>
</svg>

After

Width:  |  Height:  |  Size: 19 KiB

File diff suppressed because one or more lines are too long
@@ -0,0 +1,364 @@
When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. Some topics could be merged, and understanding the effect of merging them will help you decide which topics should and which should not be merged.
That is where hierarchical topic modeling comes in. It tries to model the possible hierarchical nature of the topics you have created to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might exist in your data.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/hierarchicaltopics/hierarchical.svg"
</div>
<br>
In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (c-TF-IDF matrix). This matrix contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging topics is done through the hierarchical clustering capabilities of `scipy` (see [here](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)). It allows for several linkage methods through which we can approximate our topic hierarchy. By default, we use the [ward](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ward.html#scipy.cluster.hierarchy.ward) linkage method, but many others are available.
Whenever we merge two topics, we can calculate the c-TF-IDF representation of the merged topic by summing the bag-of-words representations of the two original topics. We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree.
## **Example**
To demonstrate hierarchical topic modeling with BERTopic, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents.
First, we train a basic BERTopic model:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"]
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(docs)
```
Next, we can use our fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix:
```python
hierarchical_topics = topic_model.hierarchical_topics(docs)
```
The resulting `hierarchical_topics` is a dataframe in which merged topics are described. For example, if you would
merge two topics, what would the topic representation of the new topic be?
## **Linkage functions**
When creating the potential hierarchical nature of topics, we use Scipy's ward `linkage` function as a default
to generate the hierarchy. However, you might want to use a [different linkage function](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html)
for your use case, such as `single`, `complete`, `average`, `centroid`, or `median`. In BERTopic, you can define the
linkage function yourself, including the distance function that you would like to use:
```python
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)
```
## **Visualizations**
To visualize these results, we can start by running a familiar function, namely `topic_model.visualize_hierarchy`:
```python
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
```
<iframe src="hierarchical_topics.html" style="width:1000px; height: 2150px; border: 0px;"></iframe>
If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations
help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover,
we can now see which sub-topics can be found within certain larger themes.
Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can
use `topic_model.get_topic_tree` to create a text-based representation of this hierarchy. Although the general structure is more difficult
to view, we can see better which topics could be logically merged:
```python
>>> tree = topic_model.get_topic_tree(hierarchical_topics)
>>> print(tree)
.
└─atheists_atheism_god_moral_atheist
  ├─atheists_atheism_god_atheist_argument
  │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21
  │ └─■──br_god_exist_genetic_existence ── Topic: 124
  └─■──moral_morality_objective_immoral_morals ── Topic: 29
```
<details>
<summary>Click here to view the full tree.</summary>
```bash
.
├─people_armenian_said_god_armenians
│ ├─god_jesus_jehovah_lord_christ
│ │ ├─god_jesus_jehovah_lord_christ
│ │ │ ├─jehovah_lord_mormon_mcconkie_god
│ │ │ │ ├─■──ra_satan_thou_god_lucifer ── Topic: 94
│ │ │ │ └─■──jehovah_lord_mormon_mcconkie_unto ── Topic: 78
│ │ │ └─jesus_mary_god_hell_sin
│ │ │ ├─jesus_hell_god_eternal_heaven
│ │ │ │ ├─hell_jesus_eternal_god_heaven
│ │ │ │ │ ├─■──jesus_tomb_disciples_resurrection_john ── Topic: 69
│ │ │ │ │ └─■──hell_eternal_god_jesus_heaven ── Topic: 53
│ │ │ │ └─■──aaron_baptism_sin_law_god ── Topic: 89
│ │ │ └─■──mary_sin_maria_priest_conception ── Topic: 56
│ │ └─■──marriage_married_marry_ceremony_marriages ── Topic: 110
│ └─people_armenian_armenians_said_mr
│ ├─people_armenian_armenians_said_israel
│ │ ├─god_homosexual_homosexuality_atheists_sex
│ │ │ ├─homosexual_homosexuality_sex_gay_homosexuals
│ │ │ │ ├─■──kinsey_sex_gay_men_sexual ── Topic: 44
│ │ │ │ └─homosexuality_homosexual_sin_homosexuals_gay
│ │ │ │ ├─■──gay_homosexual_homosexuals_sexual_cramer ── Topic: 50
│ │ │ │ └─■──homosexuality_homosexual_sin_paul_sex ── Topic: 27
│ │ │ └─god_atheists_atheism_moral_atheist
│ │ │ ├─islam_quran_judas_islamic_book
│ │ │ │ ├─■──jim_context_challenges_articles_quote ── Topic: 36
│ │ │ │ └─islam_quran_judas_islamic_book
│ │ │ │ ├─■──islam_quran_islamic_rushdie_muslims ── Topic: 31
│ │ │ │ └─■──judas_scripture_bible_books_greek ── Topic: 33
│ │ │ └─atheists_atheism_god_moral_atheist
│ │ │ ├─atheists_atheism_god_atheist_argument
│ │ │ │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21
│ │ │ │ └─■──br_god_exist_genetic_existence ── Topic: 124
│ │ │ └─■──moral_morality_objective_immoral_morals ── Topic: 29
│ │ └─armenian_armenians_people_israel_said
│ │ ├─armenian_armenians_israel_people_jews
│ │ │ ├─tax_rights_government_income_taxes
│ │ │ │ ├─■──rights_right_slavery_slaves_residence ── Topic: 106
│ │ │ │ └─tax_government_taxes_income_libertarians
│ │ │ │ ├─■──government_libertarians_libertarian_regulation_party ── Topic: 58
│ │ │ │ └─■──tax_taxes_income_billion_deficit ── Topic: 41
│ │ │ └─armenian_armenians_israel_people_jews
│ │ │ ├─gun_guns_militia_firearms_amendment
│ │ │ │ ├─■──blacks_penalty_death_cruel_punishment ── Topic: 55
│ │ │ │ └─■──gun_guns_militia_firearms_amendment ── Topic: 7
│ │ │ └─armenian_armenians_israel_jews_turkish
│ │ │ ├─■──israel_israeli_jews_arab_jewish ── Topic: 4
│ │ │ └─■──armenian_armenians_turkish_armenia_azerbaijan ── Topic: 15
│ │ └─stephanopoulos_president_mr_myers_ms
│ │ ├─■──serbs_muslims_stephanopoulos_mr_bosnia ── Topic: 35
│ │ └─■──myers_stephanopoulos_president_ms_mr ── Topic: 87
│ └─batf_fbi_koresh_compound_gas
│ ├─■──reno_workers_janet_clinton_waco ── Topic: 77
│ └─batf_fbi_koresh_gas_compound
│ ├─batf_koresh_fbi_warrant_compound
│ │ ├─■──batf_warrant_raid_compound_fbi ── Topic: 42
│ │ └─■──koresh_batf_fbi_children_compound ── Topic: 61
│ └─■──fbi_gas_tear_bds_building ── Topic: 23
└─use_like_just_dont_new
├─game_team_year_games_like
│ ├─game_team_games_25_year
│ │ ├─game_team_games_25_season
│ │ │ ├─window_printer_use_problem_mhz
│ │ │ │ ├─mhz_wire_simms_wiring_battery
│ │ │ │ │ ├─simms_mhz_battery_cpu_heat
│ │ │ │ │ │ ├─simms_pds_simm_vram_lc
│ │ │ │ │ │ │ ├─■──pds_nubus_lc_slot_card ── Topic: 119
│ │ │ │ │ │ │ └─■──simms_simm_vram_meg_dram ── Topic: 32
│ │ │ │ │ │ └─mhz_battery_cpu_heat_speed
│ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan
│ │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan
│ │ │ │ │ │ │ │ ├─■──fan_cpu_heat_sink_fans ── Topic: 92
│ │ │ │ │ │ │ │ └─■──mhz_speed_cpu_fpu_clock ── Topic: 22
│ │ │ │ │ │ │ └─■──monitor_turn_power_computer_electricity ── Topic: 91
│ │ │ │ │ │ └─battery_batteries_concrete_duo_discharge
│ │ │ │ │ │ ├─■──duo_battery_apple_230_problem ── Topic: 121
│ │ │ │ │ │ └─■──battery_batteries_concrete_discharge_temperature ── Topic: 75
│ │ │ │ │ └─wire_wiring_ground_neutral_outlets
│ │ │ │ │ ├─wire_wiring_ground_neutral_outlets
│ │ │ │ │ │ ├─wire_wiring_ground_neutral_outlets
│ │ │ │ │ │ │ ├─■──leds_uv_blue_light_boards ── Topic: 66
│ │ │ │ │ │ │ └─■──wire_wiring_ground_neutral_outlets ── Topic: 120
│ │ │ │ │ │ └─scope_scopes_phone_dial_number
│ │ │ │ │ │ ├─■──dial_number_phone_line_output ── Topic: 93
│ │ │ │ │ │ └─■──scope_scopes_motorola_generator_oscilloscope ── Topic: 113
│ │ │ │ │ └─celp_dsp_sampling_antenna_digital
│ │ │ │ │ ├─■──antenna_antennas_receiver_cable_transmitter ── Topic: 70
│ │ │ │ │ └─■──celp_dsp_sampling_speech_voice ── Topic: 52
│ │ │ │ └─window_printer_xv_mouse_windows
│ │ │ │ ├─window_xv_error_widget_problem
│ │ │ │ │ ├─error_symbol_undefined_xterm_rx
│ │ │ │ │ │ ├─■──symbol_error_undefined_doug_parse ── Topic: 63
│ │ │ │ │ │ └─■──rx_remote_server_xdm_xterm ── Topic: 45
│ │ │ │ │ └─window_xv_widget_application_expose
│ │ │ │ │ ├─window_widget_expose_application_event
│ │ │ │ │ │ ├─■──gc_mydisplay_draw_gxxor_drawing ── Topic: 103
│ │ │ │ │ │ └─■──window_widget_application_expose_event ── Topic: 25
│ │ │ │ │ └─xv_den_polygon_points_algorithm
│ │ │ │ │ ├─■──den_polygon_points_algorithm_polygons ── Topic: 28
│ │ │ │ │ └─■──xv_24bit_image_bit_images ── Topic: 57
│ │ │ │ └─printer_fonts_print_mouse_postscript
│ │ │ │ ├─printer_fonts_print_font_deskjet
│ │ │ │ │ ├─■──scanner_logitech_grayscale_ocr_scanman ── Topic: 108
│ │ │ │ │ └─printer_fonts_print_font_deskjet
│ │ │ │ │ ├─■──printer_print_deskjet_hp_ink ── Topic: 18
│ │ │ │ │ └─■──fonts_font_truetype_tt_atm ── Topic: 49
│ │ │ │ └─mouse_ghostscript_midi_driver_postscript
│ │ │ │ ├─ghostscript_midi_postscript_files_file
│ │ │ │ │ ├─■──ghostscript_postscript_pageview_ghostview_dsc ── Topic: 104
│ │ │ │ │ └─midi_sound_file_windows_driver
│ │ │ │ │ ├─■──location_mar_file_host_rwrr ── Topic: 83
│ │ │ │ │ └─■──midi_sound_driver_blaster_soundblaster ── Topic: 98
│ │ │ │ └─■──mouse_driver_mice_ball_problem ── Topic: 68
│ │ │ └─game_team_games_25_season
│ │ │ ├─1st_sale_condition_comics_hulk
│ │ │ │ ├─sale_condition_offer_asking_cd
│ │ │ │ │ ├─condition_stereo_amp_speakers_asking
│ │ │ │ │ │ ├─■──miles_car_amfm_toyota_cassette ── Topic: 62
│ │ │ │ │ │ └─■──amp_speakers_condition_stereo_audio ── Topic: 24
│ │ │ │ │ └─games_sale_pom_cds_shipping
│ │ │ │ │ ├─pom_cds_sale_shipping_cd
│ │ │ │ │ │ ├─■──size_shipping_sale_condition_mattress ── Topic: 100
│ │ │ │ │ │ └─■──pom_cds_cd_sale_picture ── Topic: 37
│ │ │ │ │ └─■──games_game_snes_sega_genesis ── Topic: 40
│ │ │ │ └─1st_hulk_comics_art_appears
│ │ │ │ ├─1st_hulk_comics_art_appears
│ │ │ │ │ ├─lens_tape_camera_backup_lenses
│ │ │ │ │ │ ├─■──tape_backup_tapes_drive_4mm ── Topic: 107
│ │ │ │ │ │ └─■──lens_camera_lenses_zoom_pouch ── Topic: 114
│ │ │ │ │ └─1st_hulk_comics_art_appears
│ │ │ │ │ ├─■──1st_hulk_comics_art_appears ── Topic: 105
│ │ │ │ │ └─■──books_book_cover_trek_chemistry ── Topic: 125
│ │ │ │ └─tickets_hotel_ticket_voucher_package
│ │ │ │ ├─■──hotel_voucher_package_vacation_room ── Topic: 74
│ │ │ │ └─■──tickets_ticket_june_airlines_july ── Topic: 84
│ │ │ └─game_team_games_season_hockey
│ │ │ ├─game_hockey_team_25_550
│ │ │ │ ├─■──espn_pt_pts_game_la ── Topic: 17
│ │ │ │ └─■──team_25_game_hockey_550 ── Topic: 2
│ │ │ └─■──year_game_hit_baseball_players ── Topic: 0
│ │ └─bike_car_greek_insurance_msg
│ │ ├─car_bike_insurance_cars_engine
│ │ │ ├─car_insurance_cars_radar_engine
│ │ │ │ ├─insurance_health_private_care_canada
│ │ │ │ │ ├─■──insurance_health_private_care_canada ── Topic: 99
│ │ │ │ │ └─■──insurance_car_accident_rates_sue ── Topic: 82
│ │ │ │ └─car_cars_radar_engine_detector
│ │ │ │ ├─car_radar_cars_detector_engine
│ │ │ │ │ ├─■──radar_detector_detectors_ka_alarm ── Topic: 39
│ │ │ │ │ └─car_cars_mustang_ford_engine
│ │ │ │ │ ├─■──clutch_shift_shifting_transmission_gear ── Topic: 88
│ │ │ │ │ └─■──car_cars_mustang_ford_v8 ── Topic: 14
│ │ │ │ └─oil_diesel_odometer_diesels_car
│ │ │ │ ├─odometer_oil_sensor_car_drain
│ │ │ │ │ ├─■──odometer_sensor_speedo_gauge_mileage ── Topic: 96
│ │ │ │ │ └─■──oil_drain_car_leaks_taillights ── Topic: 102
│ │ │ │ └─■──diesel_diesels_emissions_fuel_oil ── Topic: 79
│ │ │ └─bike_riding_ride_bikes_motorcycle
│ │ │ ├─bike_ride_riding_bikes_lane
│ │ │ │ ├─■──bike_ride_riding_lane_car ── Topic: 11
│ │ │ │ └─■──bike_bikes_miles_honda_motorcycle ── Topic: 19
│ │ │ └─■──countersteering_bike_motorcycle_rear_shaft ── Topic: 46
│ │ └─greek_msg_kuwait_greece_water
│ │ ├─greek_msg_kuwait_greece_water
│ │ │ ├─greek_msg_kuwait_greece_dog
│ │ │ │ ├─greek_msg_kuwait_greece_dog
│ │ │ │ │ ├─greek_kuwait_greece_turkish_greeks
│ │ │ │ │ │ ├─■──greek_greece_turkish_greeks_cyprus ── Topic: 71
│ │ │ │ │ │ └─■──kuwait_iraq_iran_gulf_arabia ── Topic: 76
│ │ │ │ │ └─msg_dog_drugs_drug_food
│ │ │ │ │ ├─dog_dogs_cooper_trial_weaver
│ │ │ │ │ │ ├─■──clinton_bush_quayle_reagan_panicking ── Topic: 101
│ │ │ │ │ │ └─dog_dogs_cooper_trial_weaver
│ │ │ │ │ │ ├─■──cooper_trial_weaver_spence_witnesses ── Topic: 90
│ │ │ │ │ │ └─■──dog_dogs_bike_trained_springer ── Topic: 67
│ │ │ │ │ └─msg_drugs_drug_food_chinese
│ │ │ │ │ ├─■──msg_food_chinese_foods_taste ── Topic: 30
│ │ │ │ │ └─■──drugs_drug_marijuana_cocaine_alcohol ── Topic: 72
│ │ │ │ └─water_theory_universe_science_larsons
│ │ │ │ ├─water_nuclear_cooling_steam_dept
│ │ │ │ │ ├─■──rocketry_rockets_engines_nuclear_plutonium ── Topic: 115
│ │ │ │ │ └─water_cooling_steam_dept_plants
│ │ │ │ │ ├─■──water_dept_phd_environmental_atmospheric ── Topic: 97
│ │ │ │ │ └─■──cooling_water_steam_towers_plants ── Topic: 109
│ │ │ │ └─theory_universe_larsons_larson_science
│ │ │ │ ├─■──theory_universe_larsons_larson_science ── Topic: 54
│ │ │ │ └─■──oort_cloud_grbs_gamma_burst ── Topic: 80
│ │ │ └─helmet_kirlian_photography_lock_wax
│ │ │ ├─helmet_kirlian_photography_leaf_mask
│ │ │ │ ├─kirlian_photography_leaf_pictures_deleted
│ │ │ │ │ ├─deleted_joke_stuff_maddi_nickname
│ │ │ │ │ │ ├─■──joke_maddi_nickname_nicknames_frank ── Topic: 43
│ │ │ │ │ │ └─■──deleted_stuff_bookstore_joke_motto ── Topic: 81
│ │ │ │ │ └─■──kirlian_photography_leaf_pictures_aura ── Topic: 85
│ │ │ │ └─helmet_mask_liner_foam_cb
│ │ │ │ ├─■──helmet_liner_foam_cb_helmets ── Topic: 112
│ │ │ │ └─■──mask_goalies_77_santore_tl ── Topic: 123
│ │ │ └─lock_wax_paint_plastic_ear
│ │ │ ├─■──lock_cable_locks_bike_600 ── Topic: 117
│ │ │ └─wax_paint_ear_plastic_skin
│ │ │ ├─■──wax_paint_plastic_scratches_solvent ── Topic: 65
│ │ │ └─■──ear_wax_skin_greasy_acne ── Topic: 116
│ │ └─m4_mp_14_mw_mo
│ │ ├─m4_mp_14_mw_mo
│ │ │ ├─■──m4_mp_14_mw_mo ── Topic: 111
│ │ │ └─■──test_ensign_nameless_deane_deanebinahccbrandeisedu ── Topic: 118
│ │ └─■──ites_cheek_hello_hi_ken ── Topic: 3
│ └─space_medical_health_disease_cancer
│ ├─medical_health_disease_cancer_patients
│ │ ├─■──cancer_centers_center_medical_research ── Topic: 122
│ │ └─health_medical_disease_patients_hiv
│ │ ├─patients_medical_disease_candida_health
│ │ │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48
│ │ │ └─patients_disease_cancer_medical_doctor
│ │ │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34
│ │ │ └─■──pain_drug_patients_disease_diet ── Topic: 26
│ │ └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9
│ └─space_launch_nasa_shuttle_orbit
│ ├─space_moon_station_nasa_launch
│ │ ├─■──sky_advertising_billboard_billboards_space ── Topic: 59
│ │ └─■──space_station_moon_redesign_nasa ── Topic: 16
│ └─space_mission_hst_launch_orbit
│ ├─space_launch_nasa_orbit_propulsion
│ │ ├─■──space_launch_nasa_propulsion_astronaut ── Topic: 47
│ │ └─■──orbit_km_jupiter_probe_earth ── Topic: 86
│ └─■──hst_mission_shuttle_orbit_arrays ── Topic: 60
└─drive_file_key_windows_use
├─key_file_jpeg_encryption_image
│ ├─key_encryption_clipper_chip_keys
│ │ ├─■──key_clipper_encryption_chip_keys ── Topic: 1
│ │ └─■──entry_file_ripem_entries_key ── Topic: 73
│ └─jpeg_image_file_gif_images
│ ├─motif_graphics_ftp_available_3d
│ │ ├─motif_graphics_openwindows_ftp_available
│ │ │ ├─■──openwindows_motif_xview_windows_mouse ── Topic: 20
│ │ │ └─■──graphics_widget_ray_3d_available ── Topic: 95
│ │ └─■──3d_machines_version_comments_contact ── Topic: 38
│ └─jpeg_image_gif_images_format
│ ├─■──gopher_ftp_files_stuffit_images ── Topic: 51
│ └─■──jpeg_image_gif_format_images ── Topic: 13
└─drive_db_card_scsi_windows
├─db_windows_dos_mov_os2
│ ├─■──copy_protection_program_software_disk ── Topic: 64
│ └─■──db_windows_dos_mov_os2 ── Topic: 8
└─drive_card_scsi_drives_ide
├─drive_scsi_drives_ide_disk
│ ├─■──drive_scsi_drives_ide_disk ── Topic: 6
│ └─■──meg_sale_ram_drive_shipping ── Topic: 12
└─card_modem_monitor_video_drivers
├─■──card_monitor_video_drivers_vga ── Topic: 5
└─■──modem_port_serial_irq_com ── Topic: 10
```
</details>
## **Merge topics**
After seeing the potential hierarchy of your topics, you might want to merge specific
topics. For example, if topic 1 is
`1_space_launch_moon_nasa` and topic 2 is `2_spacecraft_solar_space_orbit` it might
make sense to merge those two topics as they are quite similar in meaning. In BERTopic,
you can use `.merge_topics` to manually select and merge those topics. Doing so will
update their topic representation which in turn updates the entire model:
```python
topics_to_merge = [1, 2]
topic_model.merge_topics(docs, topics_to_merge)
```
If you have several groups of topics you want to merge, create a list of lists instead:
```python
topics_to_merge = [[1, 2],
[3, 4]]
topic_model.merge_topics(docs, topics_to_merge)
```
@@ -0,0 +1,89 @@
Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Here, we might want to see how we can transform those 20 classes into 20 topics. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic without actually learning them.
We can view this as a manual topic modeling approach. There is no underlying algorithm for detecting these topics since you already have done that before. That may be simply because they are already available, as with the 20 NewsGroups dataset, or because you have previously created clusters of documents using packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat), or something entirely different.
In other words, we can pass our labels to BERTopic and it will try to transform those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/manual/pipeline.svg"
</div>
<br>
To do so, we need to skip over the dimensionality reduction and clustering steps since we already know the labels for our documents. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:
```python
from sklearn.datasets import fetch_20newsgroups
# Get labeled data
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']
```
Then, we make sure to create empty instances of the embedding, dimensionality reduction, and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process:
```python
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
# Prepare our empty sub-models and reduce frequent words while we are at it.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
embedding_model=empty_embedding_model,
umap_model=empty_dimensionality_model,
hdbscan_model=empty_cluster_model,
ctfidf_model=ctfidf_model
)
topics, probs = topic_model.fit_transform(docs, y=y)
```
Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/manual/table.svg"
</div>
<br>
We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:
```python
# Map input `y` to topics
mappings = topic_model.topic_mapper_.get_mappings()
mappings = {value: data["target_names"][key] for key, value in mappings.items()}
# Assign original classes to our topics
df = topic_model.get_topic_info()
df["Class"] = df.Topic.map(mappings)
df
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/manual/table_classes.svg"
</div>
<br>
We can see that the c-TF-IDF representations nicely extract the words that best represent our input classes. This is all done without actually embedding and clustering the data.
As a result, the entire "training" process only takes a couple of seconds. Moreover, we can still use BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.
!!! note
The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following:
```python
mappings = topic_model.topic_mapper_.get_mappings()
y_mapped = [mappings[val] for val in y]
```
@@ -0,0 +1,12 @@
<svg width="293" height="164" viewBox="0 0 293 164" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="0.5" y="0.5" width="118" height="42" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="22" y="24.7637">Documents</tspan></text>
<rect x="53.5" y="121.5" width="65" height="42" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="65" y="145.764">Labels</tspan></text>
<rect x="201.5" y="60.5" width="91" height="42" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="223" y="84.7637">c-TF-IDF</tspan></text>
<path d="M246.293 52.7071C246.683 53.0976 247.317 53.0976 247.707 52.7071L254.071 46.3431C254.462 45.9526 254.462 45.3195 254.071 44.9289C253.681 44.5384 253.047 44.5384 252.657 44.9289L247 50.5858L241.343 44.9289C240.953 44.5384 240.319 44.5384 239.929 44.9289C239.538 45.3195 239.538 45.9526 239.929 46.3431L246.293 52.7071ZM246 21L246 52L248 52L248 21L246 21Z" fill="black"/>
<path d="M247.707 113.293C247.317 112.902 246.683 112.902 246.293 113.293L239.929 119.657C239.538 120.047 239.538 120.681 239.929 121.071C240.319 121.462 240.953 121.462 241.343 121.071L247 115.414L252.657 121.071C253.047 121.462 253.681 121.462 254.071 121.071C254.462 120.681 254.462 120.047 254.071 119.657L247.707 113.293ZM248 145L248 114L246 114L246 145L248 145Z" fill="black"/>
<line x1="248" y1="20" x2="130" y2="20" stroke="black" stroke-width="2"/>
<line x1="248" y1="145" x2="130" y2="145" stroke="black" stroke-width="2"/>
</svg>

After

Width:  |  Height:  |  Size: 1.7 KiB

@@ -0,0 +1,52 @@
<svg width="387" height="347" viewBox="0 0 387 347" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="387" height="34" fill="white"/>
<line y1="35" x2="387" y2="35" stroke="#BDBDBD" stroke-width="2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="31" y="24.0576">Topic</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="82" y="24.0576">Count&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="338" y="23.0576">Name</tspan></text>
<rect y="36" width="387" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="58.0576">0</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="58.0576">0</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="59.0576">999&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="215.92" y="59.0576">0_game_hockey_team_25&#10;</tspan></text>
<rect y="70" width="387" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="210.232" y="92.0576">1_god_church_jesus_christ&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="92.0576">997&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="92.0576">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="92.0576">1</tspan></text>
<rect y="104" width="387" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="126.058">2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="126.058">2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="127.058">996&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="236.551" y="127.058">2_bike_dod_ride_bikes&#10;</tspan></text>
<rect y="138" width="387" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="213.445" y="160.058">3_baseball_game_he_year&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="160.058">994&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="160.058">3</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="160.058">3</tspan></text>
<rect y="172" width="387" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="194.058">4</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="194.058">4</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="195.058">991&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="197.545" y="195.058">4_key_encryption_db_clipper&#10;</tspan></text>
<rect y="206" width="387" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="229.674" y="228.058">5_car_cars_engine_ford&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="228.058">990</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="228.058">5</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="228.058">5</tspan></text>
<rect y="240" width="387" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="262.058">6</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="262.058">6</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="263.058">990</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="157.568" y="263.058">6_medical_patients_cancer_disease&#10;</tspan></text>
<rect y="274" width="387" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="181.781" y="296.058">7_window_server_widget_motif&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="296.058">988&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="296.058">7</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="296.058">7</tspan></text>
<rect y="308" width="387" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="330.058">8</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="330.058">8</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="331.058">988&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="207.129" y="331.058">8_space_launch_nasa_orbit&#10;</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 7.6 KiB

@@ -0,0 +1,62 @@
<svg width="550" height="347" viewBox="0 0 550 347" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="550" height="34" fill="white"/>
<line y1="35" x2="547" y2="35" stroke="#BDBDBD" stroke-width="2"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="31" y="24.0576">Topic</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="82" y="24.0576">Count&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="338" y="23.0576">Name</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="498" y="23.0576">Class</tspan></text>
<rect y="36" width="550" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="58.0576">0</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="58.0576">0</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="59.0576">999&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="215.92" y="59.0576">0_game_hockey_team_25&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="436.131" y="60.0576">rec.sport.hockey&#10;</tspan></text>
<rect y="70" width="550" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="210.232" y="92.0576">1_god_church_jesus_christ&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="92.0576">997&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="92.0576">1</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="92.0576">1</tspan></text>
<rect y="104" width="550" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="126.058">2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="126.058">2</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="127.058">996&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="236.551" y="127.058">2_bike_dod_ride_bikes&#10;</tspan></text>
<rect y="138" width="550" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="213.445" y="160.058">3_baseball_game_he_year&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="160.058">994&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="160.058">3</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="160.058">3</tspan></text>
<rect y="172" width="550" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="194.058">4</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="194.058">4</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="195.058">991&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="197.545" y="195.058">4_key_encryption_db_clipper&#10;</tspan></text>
<rect y="206" width="550" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="229.674" y="228.058">5_car_cars_engine_ford&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="228.058">990</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="228.058">5</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="228.058">5</tspan></text>
<rect y="240" width="550" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="262.058">6</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="262.058">6</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="263.058">990</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="157.568" y="263.058">6_medical_patients_cancer_disease&#10;</tspan></text>
<rect y="274" width="550" height="34" fill="#F5F5F5"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="181.781" y="296.058">7_window_server_widget_motif&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="296.058">988&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="296.058">7</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="296.058">7</tspan></text>
<rect y="308" width="550" height="34" fill="white"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="7" y="330.058">8</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="57.3574" y="330.058">8</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="104.072" y="331.058">988&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="207.129" y="331.058">8_space_launch_nasa_orbit&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="482.67" y="331.058">sci.space&#10;</tspan><tspan x="538" y="348.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="435.652" y="297.058">comp.windows.x&#10;</tspan><tspan x="538" y="314.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="490.982" y="264.058">sci.med&#10;</tspan><tspan x="538" y="281.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="481.18" y="229.058">rec.autos&#10;</tspan><tspan x="538" y="246.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="486.936" y="195.058">sci.crypt&#10;</tspan><tspan x="538" y="212.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="429.117" y="160.058">rec.sport.baseball&#10;</tspan><tspan x="538" y="177.058">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="441.463" y="127.058">rec.motorcycles&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" letter-spacing="0em"><tspan x="412.232" y="92.0576">soc.religion.christian&#10;</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 9.7 KiB

@@ -0,0 +1,107 @@
# Merge Multiple Fitted Models
After you have trained a new BERTopic model on your data, new data might still be coming in. Although you can use [online BERTopic](https://maartengr.github.io/BERTopic/getting_started/online/online.html), you might prefer to keep using the default HDBSCAN and UMAP models, which do not support incremental learning out of the box.
Instead, you can train a new BERTopic model on incoming data and merge it with your base model to detect whether new topics have appeared in the unseen documents. This is a great way of detecting whether your new model contains information that was not previously found in your base topic model.
Similarly, you might want to train multiple BERTopic models using different sets of settings, even though they might all be using the same underlying embedding model. Merging these models would also allow for a single model that you can use throughout your use cases.
Lastly, this method also allows for a degree of `federated learning` where each node trains a topic model and the resulting models are aggregated on a central server.
## **Example**
To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.
First, we train three separate models on different parts of the data:
```python
from umap import UMAP
from bertopic import BERTopic
from datasets import load_dataset
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
# Extract abstracts to train on and corresponding titles
abstracts_1 = dataset["abstract"][:5_000]
abstracts_2 = dataset["abstract"][5_000:10_000]
abstracts_3 = dataset["abstract"][10_000:15_000]
# Create topic models
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
topic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)
topic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)
topic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)
```
Then, we can combine all three models into one with `.merge_models`:
```python
# Combine all models into one
merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])
```
When we inspect the first model, we can see it has 52 topics:
```python
>>> len(topic_model_1.get_topic_info())
52
```
Now, when we inspect the merged model, we can see it has 57 topics:
```python
>>> len(merged_model.get_topic_info())
57
```
It seems that by merging these three models, there were 5 undiscovered topics that we could add to the very first model.
!!! Note
Note that the models are merged sequentially. This means that the comparison starts with `topic_model_1` and that
each new topic from `topic_model_2` and `topic_model_3` will be added to `topic_model_1`.
We can check the newly added topics in the `merged_model` by simply looking at the 5 latest topics that were added. The order of topics from `topic_model_1`
remains the same. All new topics are simply added on top of them.
Let's inspect them:
```python
>>> merged_model.get_topic_info().tail(5)
```
| | Topic | Count | Name | Representation | Representative_Docs |
|---:|--------:|--------:|:---------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|----------------------:|
| 52 | 51 | 47 | 50_activity_mobile_wearable_sensors | ['activity', 'mobile', 'wearable', 'sensors', 'falls', 'human', 'phone', 'recognition', 'activities', 'accelerometer'] | nan |
| 53 | 52 | 48 | 25_music_musical_audio_chord | ['music', 'musical', 'audio', 'chord', 'and', 'we', 'to', 'that', 'of', 'for'] | nan |
| 54 | 53 | 32 | 36_fairness_discrimination_fair_groups | ['fairness', 'discrimination', 'fair', 'groups', 'protected', 'decision', 'we', 'of', 'classifier', 'to'] | nan |
| 55 | 54 | 30 | 38_traffic_driver_prediction_flow | ['traffic', 'driver', 'prediction', 'flow', 'trajectory', 'the', 'and', 'congestion', 'of', 'transportation'] | nan |
| 56 | 55 | 22 | 50_spiking_neurons_networks_learning | ['spiking', 'neurons', 'networks', 'learning', 'neural', 'snn', 'dynamics', 'plasticity', 'snns', 'of'] | nan |
It seems that topics about activity, music, fairness, traffic, and spiking networks were added to the base topic model! There are two things that you might have noticed. First,
the representative documents were not added to the model. This is for privacy reasons: you might want to combine models that were trained on different stations, which
would allow for a degree of `federated learning`. Second, the names of the new topics contain topic ids that refer to one of the old models. They were purposefully left this way
so that the user can identify which topics were newly added and inspect them in the original models.
## **min_similarity**
The way the models are merged is through comparison of their topic embeddings. If topics between models are similar enough, then they will be regarded as the same topics
and the topic of the first model in the list will be chosen. However, if topics between models are dissimilar enough, then the topic of the latter model will be added to the former.
This (dis)similarity can be tweaked using the `min_similarity` parameter. Increasing this value will increase the chance of adding new topics. In contrast, decreasing this value
will make it more strict and therefore decrease the chance of adding new topics. The value is set to `0.7` by default, so let's see what happens if we were to increase this value to
`0.9`:
```python
# Combine all models into one
merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3], min_similarity=0.9)
```
When we inspect the number of topics in our new model, we can see that they have increased quite a bit:
```python
>>> len(merged_model.get_topic_info())
102
```
This demonstrates the influence of `min_similarity` on the number of new topics that are added to the base model.
@@ -0,0 +1,46 @@
During the development of BERTopic, many different types of representations can be created, from keywords and phrases to summaries and custom labels. There is a variety of techniques that one can choose from to represent a topic. As such, there are a number of interesting and creative ways one can summarize topics. A topic is more than just a single representation.
Therefore, `multi-aspect topic modeling` is introduced! During the `.fit` or `.fit_transform` stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).
<figure markdown>
![Image title](multiaspect.svg)
<figcaption></figcaption>
</figure>
The approach is rather straightforward. We might want to represent our topics using a `PartOfSpeech` representation model but we might also want to try out `KeyBERTInspired` and compare those representation models. We can do this as follows:
```python
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from sklearn.datasets import fetch_20newsgroups
# Documents to train on
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
# The main representation of a topic
main_representation = KeyBERTInspired()
# Additional ways of representing a topic
aspect_model1 = PartOfSpeech("en_core_web_sm")
aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]
# Add all models together to be run in a single `fit`
representation_model = {
"Main": main_representation,
"Aspect1": aspect_model1,
"Aspect2": aspect_model2
}
topic_model = BERTopic(representation_model=representation_model).fit(docs)
```
As shown above, to perform multi-aspect topic modeling, we make sure that `representation_model` is a dictionary where each representation model pipeline is defined.
The main pipeline, that is used in most visualization options, is defined with the `"Main"` key. All other aspects can be defined however you want. In the example above, the two additional aspects that we are interested in are defined as `"Aspect1"` and `"Aspect2"`.
After we have fitted our model, we can access all representations with `topic_model.get_topic_info()`:
<br><br>
<img src="table.PNG">
<br><br>
As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in `topic_model.topic_aspects_`.
@@ -0,0 +1,68 @@
<svg width="398" height="426" viewBox="0 0 398 426" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="125" y="388" width="118" height="38" fill="#64B5F6"/>
<rect x="217" y="378" width="20" height="8" fill="#64B5F6"/>
<rect x="189" y="378" width="20" height="8" fill="#64B5F6"/>
<rect x="161" y="378" width="20" height="8" fill="#64B5F6"/>
<rect x="133" y="378" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="151.256" y="415.939">SBERT</tspan></text>
<rect x="125" y="348" width="118" height="38" fill="#E57373"/>
<rect x="217" y="338" width="20" height="8" fill="#E57373"/>
<rect x="189" y="338" width="20" height="8" fill="#E57373"/>
<rect x="161" y="338" width="20" height="8" fill="#E57373"/>
<rect x="133" y="338" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="154.254" y="375.939">UMAP</tspan></text>
<rect x="125" y="308" width="118" height="38" fill="#4DB6AC"/>
<rect x="217" y="298" width="20" height="8" fill="#4DB6AC"/>
<rect x="189" y="298" width="20" height="8" fill="#4DB6AC"/>
<rect x="161" y="298" width="20" height="8" fill="#4DB6AC"/>
<rect x="133" y="298" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="134.342" y="335.939">HDBSCAN</tspan></text>
<rect x="125" y="268" width="118" height="38" fill="#FFD54F"/>
<rect x="217" y="258" width="20" height="8" fill="#FFD54F"/>
<rect x="189" y="258" width="20" height="8" fill="#FFD54F"/>
<rect x="161" y="258" width="20" height="8" fill="#FFD54F"/>
<rect x="133" y="258" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="131.346" y="291.161">CountVectorizer</tspan></text>
<rect x="125" y="228" width="118" height="38" fill="#90A4AE"/>
<rect x="217" y="218" width="20" height="8" fill="#90A4AE"/>
<rect x="189" y="218" width="20" height="8" fill="#90A4AE"/>
<rect x="161" y="218" width="20" height="8" fill="#90A4AE"/>
<rect x="133" y="218" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="139.938" y="255.939">c-TF-IDF</tspan></text>
<rect x="125" y="98" width="118" height="38" fill="#3F51B5"/>
<rect x="217" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="189" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="161" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="133" y="88" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="155.804" y="120.161">ChatGPT</tspan></text>
<rect y="98" width="118" height="38" fill="#3F51B5"/>
<rect x="92" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="64" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="36" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="8" y="88" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="2.22656" y="120.161">KeyBERTInspired</tspan></text>
<rect y="58" width="118" height="38" fill="#3F51B5"/>
<rect x="92" y="48" width="20" height="8" fill="#3F51B5"/>
<rect x="64" y="48" width="20" height="8" fill="#3F51B5"/>
<rect x="36" y="48" width="20" height="8" fill="#3F51B5"/>
<rect x="8" y="48" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="2.76611" y="72.1606">MaximalMarginal</tspan><tspan x="25.4907" y="88.1606">Relevance</tspan></text>
<rect x="280" y="98" width="118" height="38" fill="#3F51B5"/>
<rect x="372" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="344" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="316" y="88" width="20" height="8" fill="#3F51B5"/>
<rect x="288" y="88" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="309.065" y="114.058">Optional&#10;</tspan><tspan x="298.271" y="131.058">Fine-tuning</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" letter-spacing="0em"><tspan x="34.4282" y="153.161">Aspect 1</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="152.077" y="27.9697">Create multiple topic representations, or aspects, </tspan><tspan x="168.107" y="39.9697">simultaneously. Topics are more than just </tspan><tspan x="152.229" y="51.9697">keywords and could be represented by a number </tspan><tspan x="221.813" y="63.9697">of ways together.</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" letter-spacing="0em"><tspan x="158.428" y="153.161">Aspect 2</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" letter-spacing="0em"><tspan x="312.352" y="153.161">Aspect n</tspan></text>
<path d="M52.7071 174.293C52.3166 173.902 51.6834 173.902 51.2929 174.293L44.9289 180.657C44.5384 181.047 44.5384 181.681 44.9289 182.071C45.3195 182.462 45.9526 182.462 46.3431 182.071L52 176.414L57.6569 182.071C58.0474 182.462 58.6805 182.462 59.0711 182.071C59.4616 181.681 59.4616 181.047 59.0711 180.657L52.7071 174.293ZM53 191L53 175L51 175L51 191L53 191Z" fill="black"/>
<path d="M185.707 174.293C185.317 173.902 184.683 173.902 184.293 174.293L177.929 180.657C177.538 181.047 177.538 181.681 177.929 182.071C178.319 182.462 178.953 182.462 179.343 182.071L185 176.414L190.657 182.071C191.047 182.462 191.681 182.462 192.071 182.071C192.462 181.681 192.462 181.047 192.071 180.657L185.707 174.293ZM186 191L186 175L184 175L184 191L186 191Z" fill="black"/>
<path d="M352.707 174.293C352.317 173.902 351.683 173.902 351.293 174.293L344.929 180.657C344.538 181.047 344.538 181.681 344.929 182.071C345.319 182.462 345.953 182.462 346.343 182.071L352 176.414L357.657 182.071C358.047 182.462 358.681 182.462 359.071 182.071C359.462 181.681 359.462 181.047 359.071 180.657L352.707 174.293ZM353 191L353 175L351 175L351 191L353 191Z" fill="black"/>
<line x1="52" y1="190" x2="352" y2="190" stroke="black" stroke-width="2"/>
<line x1="185" y1="191" x2="185" y2="207" stroke="black" stroke-width="2"/>
<circle cx="251.5" cy="117.5" r="2.5" fill="black"/>
<circle cx="261.5" cy="117.5" r="2.5" fill="black"/>
<circle cx="271.5" cy="117.5" r="2.5" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 7.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 246 KiB

@@ -0,0 +1,32 @@
<svg width="652" height="186" viewBox="0 0 652 186" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="22" y="76.8636">Images</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="15.3008" y="41.9697">Embed text, </tspan><tspan x="8.82129" y="53.9697">images or both </tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="390.892" y="55.9697">For each topic, find the best matching images </tspan><tspan x="392.127" y="67.9697">based on the most representative documents</tspan></text>
<rect x="14.5" y="59.5" width="56" height="27" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="29" y="175.864">Text</tspan></text>
<rect x="14.5" y="158.5" width="56" height="27" stroke="black"/>
<line x1="79" y1="123" x2="42" y2="123" stroke="black" stroke-width="2"/>
<line x1="607" y1="73" x2="79" y2="73" stroke="black" stroke-width="2"/>
<line x1="43" y1="97" x2="43" y2="122" stroke="black" stroke-width="2"/>
<line x1="43" y1="123" x2="43" y2="152" stroke="black" stroke-width="2"/>
<rect x="118" y="90" width="534" height="57" fill="white"/>
<rect x="118.5" y="104.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="134.238" y="158.97">clip-ViT-B-32</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="301" y="158.97">UMAP</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="437.149" y="158.97">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="586" y="158.97">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="127" y="128.764">Embeddings</tspan></text>
<rect x="260.5" y="104.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="274.094" y="123.764">Dimensionality &#10;</tspan><tspan x="289.762" y="137.764">reduction</tspan></text>
<path d="M244.707 123.707C245.098 123.317 245.098 122.683 244.707 122.293L238.343 115.929C237.953 115.538 237.319 115.538 236.929 115.929C236.538 116.319 236.538 116.953 236.929 117.343L242.586 123L236.929 128.657C236.538 129.047 236.538 129.681 236.929 130.071C237.319 130.462 237.953 130.462 238.343 130.071L244.707 123.707ZM217 124H244V122H217V124Z" fill="black"/>
<path d="M104.707 123.707C105.098 123.317 105.098 122.683 104.707 122.293L98.3431 115.929C97.9526 115.538 97.3195 115.538 96.9289 115.929C96.5384 116.319 96.5384 116.953 96.9289 117.343L102.586 123L96.9289 128.657C96.5384 129.047 96.5384 129.681 96.9289 130.071C97.3195 130.462 97.9526 130.462 98.3431 130.071L104.707 123.707ZM77 124H104V122H77V124Z" fill="black"/>
<rect x="413.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="435" y="128.764">Clustering</tspan></text>
<path d="M403.707 123.707C404.098 123.317 404.098 122.683 403.707 122.293L397.343 115.929C396.953 115.538 396.319 115.538 395.929 115.929C395.538 116.319 395.538 116.953 395.929 117.343L401.586 123L395.929 128.657C395.538 129.047 395.538 129.681 395.929 130.071C396.319 130.462 396.953 130.462 397.343 130.071L403.707 123.707ZM376 124H403V122H376V124Z" fill="black"/>
<rect x="560.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="590.404" y="120.764">Topic &#10;</tspan><tspan x="568.215" y="134.764">representation</tspan></text>
<rect x="560.5" y="0.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="565.232" y="16.7637">Representative &#10;</tspan><tspan x="587.785" y="30.7637">images</tspan></text>
<path d="M544.707 123.707C545.098 123.317 545.098 122.683 544.707 122.293L538.343 115.929C537.953 115.538 537.319 115.538 536.929 115.929C536.538 116.319 536.538 116.953 536.929 117.343L542.586 123L536.929 128.657C536.538 129.047 536.538 129.681 536.929 130.071C537.319 130.462 537.953 130.462 538.343 130.071L544.707 123.707ZM517 124H544V122H517V124Z" fill="black"/>
<path d="M607.707 51.2929C607.317 50.9024 606.683 50.9024 606.293 51.2929L599.929 57.6569C599.538 58.0474 599.538 58.6805 599.929 59.0711C600.319 59.4616 600.953 59.4616 601.343 59.0711L607 53.4142L612.657 59.0711C613.047 59.4616 613.681 59.4616 614.071 59.0711C614.462 58.6805 614.462 58.0474 614.071 57.6569L607.707 51.2929ZM608 98L608 52L606 52L606 98L608 98Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 209 KiB

@@ -0,0 +1,32 @@
<svg width="803" height="169" viewBox="0 0 803 169" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="21" y="127.864">Images</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="9.67578" y="158.97">Embed images</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="542.892" y="55.9697">For each topic, find the best matching images </tspan><tspan x="544.127" y="67.9697">based on the most representative documents</tspan></text>
<rect x="13.5" y="104.5" width="60" height="42" stroke="black"/>
<line x1="757" y1="73" x2="41" y2="73" stroke="black" stroke-width="2"/>
<line x1="40" y1="94" x2="40" y2="72" stroke="black" stroke-width="2"/>
<rect x="120" y="90" width="534" height="57" fill="white"/>
<rect x="120.5" y="104.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="136.238" y="158.97">clip-ViT-B-32</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="303" y="158.97">UMAP</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="439.149" y="158.97">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="557" y="158.97">vit-gpt2-image-captioning</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="738" y="158.97">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="129" y="128.764">Embeddings</tspan></text>
<rect x="262.5" y="104.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="276.094" y="123.764">Dimensionality &#10;</tspan><tspan x="291.762" y="137.764">reduction</tspan></text>
<path d="M246.707 123.707C247.098 123.317 247.098 122.683 246.707 122.293L240.343 115.929C239.953 115.538 239.319 115.538 238.929 115.929C238.538 116.319 238.538 116.953 238.929 117.343L244.586 123L238.929 128.657C238.538 129.047 238.538 129.681 238.929 130.071C239.319 130.462 239.953 130.462 240.343 130.071L246.707 123.707ZM219 124H246V122H219V124Z" fill="black"/>
<path d="M106.707 123.707C107.098 123.317 107.098 122.683 106.707 122.293L100.343 115.929C99.9526 115.538 99.3195 115.538 98.9289 115.929C98.5384 116.319 98.5384 116.953 98.9289 117.343L104.586 123L98.9289 128.657C98.5384 129.047 98.5384 129.681 98.9289 130.071C99.3195 130.462 99.9526 130.462 100.343 130.071L106.707 123.707ZM79 124H106V122H79V124Z" fill="black"/>
<rect x="415.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="437" y="128.764">Clustering</tspan></text>
<path d="M405.707 123.707C406.098 123.317 406.098 122.683 405.707 122.293L399.343 115.929C398.953 115.538 398.319 115.538 397.929 115.929C397.538 116.319 397.538 116.953 397.929 117.343L403.586 123L397.929 128.657C397.538 129.047 397.538 129.681 397.929 130.071C398.319 130.462 398.953 130.462 399.343 130.071L405.707 123.707ZM378 124H405V122H378V124Z" fill="black"/>
<rect x="562.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="590.224" y="116.97">Caption &#10;</tspan><tspan x="574.11" y="128.97">Representative &#10;</tspan><tspan x="592.182" y="140.97">Images</tspan></text>
<rect x="710.5" y="0.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="715.232" y="16.7637">Representative &#10;</tspan><tspan x="737.785" y="30.7637">images</tspan></text>
<path d="M546.707 123.707C547.098 123.317 547.098 122.683 546.707 122.293L540.343 115.929C539.953 115.538 539.319 115.538 538.929 115.929C538.538 116.319 538.538 116.953 538.929 117.343L544.586 123L538.929 128.657C538.538 129.047 538.538 129.681 538.929 130.071C539.319 130.462 539.953 130.462 540.343 130.071L546.707 123.707ZM519 124H546V122H519V124Z" fill="black"/>
<rect x="711.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="741.404" y="120.764">Topic &#10;</tspan><tspan x="719.215" y="134.764">representation</tspan></text>
<path d="M695.707 123.707C696.098 123.317 696.098 122.683 695.707 122.293L689.343 115.929C688.953 115.538 688.319 115.538 687.929 115.929C687.538 116.319 687.538 116.953 687.929 117.343L693.586 123L687.929 128.657C687.538 129.047 687.538 129.681 687.929 130.071C688.319 130.462 688.953 130.462 689.343 130.071L695.707 123.707ZM668 124H695V122H668V124Z" fill="black"/>
<path d="M757.707 51.2929C757.317 50.9024 756.683 50.9024 756.293 51.2929L749.929 57.6569C749.538 58.0474 749.538 58.6805 749.929 59.0711C750.319 59.4616 750.953 59.4616 751.343 59.0711L757 53.4142L762.657 59.0711C763.047 59.4616 763.681 59.4616 764.071 59.0711C764.462 58.6805 764.462 58.0474 764.071 57.6569L757.707 51.2929ZM758 98L758 52L756 52L756 98L758 98Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 5.8 KiB

@@ -0,0 +1,190 @@
Documents or text are often accompanied by imagery or the other way around. For example, social media images with captions and products with descriptions. Topic modeling has traditionally focused on creating topics from textual representations. However, as more multimodal representations are created, the need for multimodal topics increases.
BERTopic can perform **multimodal topic modeling** in a number of ways during `.fit` and `.fit_transform` stages.
## **Text + Images**
The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always come with a description.
<figure markdown>
![Image title](images_and_text.svg)
<figcaption></figcaption>
</figure>
In this example, we are going to use images from `flickr`, each of which has a caption associated with it:
```python
# NOTE: This requires the `datasets` package which you can
# install with `pip install datasets`
from datasets import load_dataset
ds = load_dataset("maderix/flickr_bw_rgb")
images = ds["train"]["image"]
docs = ds["train"]["caption"]
```
The `docs` variable contains the captions for each image in `images`. We can now use these variables to run our multimodal example:
!!! Tip
Do note that it is better to pass the paths of the images instead of the images themselves as there is no need to keep all images in memory. When passing the paths of the images, they are only opened temporarily when they are needed.
```python
from bertopic import BERTopic
from bertopic.representation import VisualRepresentation
# Additional ways of representing a topic
visual_model = VisualRepresentation()
# Make sure to add the `visual_model` to a dictionary
representation_model = {
"Visual_Aspect": visual_model,
}
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
In this example, we are clustering the documents and are then looking for the best matching images to the resulting clusters.
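Once the model has been fitted on the documents and their images (e.g., `topics, probs = topic_model.fit_transform(docs, images=images)`), we can access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`.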
If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:
```python
import base64
from io import BytesIO
from IPython.display import HTML
def image_base64(im):
if isinstance(im, str):
im = get_thumbnail(im)
with BytesIO() as buffer:
im.save(buffer, 'jpeg')
return base64.b64encode(buffer.getvalue()).decode()
def image_formatter(im):
return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
# Extract dataframe
df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1)
# Visualize the images
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
```
<br><br>
<img src="images_and_text.jpg">
<br><br>
!!! Tip
In the example above, we are clustering the documents but since you have
images, you might want to cluster those or cluster an aggregation of both
images and documents. For that, you can use the new `MultiModalBackend`
to generate embeddings:
```python
from bertopic.backend import MultiModalBackend
model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
# Embed documents only
doc_embeddings = model.embed_documents(docs)
# Embedding images only
image_embeddings = model.embed_images(images)
# Embed both images and documents, then average them
doc_image_embeddings = model.embed(docs, images)
```
## **Images Only**
Traditional topic modeling techniques can only be run on textual data, as is shown in the example above. However, there are plenty of cases where textual data is not available but images are. BERTopic allows topic modeling to be performed using only images as your input data.
<figure markdown>
![Image title](images_only.svg)
<figcaption></figcaption>
</figure>
To run BERTopic on images only, we first need to embed our images and then define a model that converts images to text. To do so, we are going to need some images. We will take the same images as above, but this time save them locally and pass the paths to the images instead. As mentioned before, this will make sure that we do not hold too many images in memory when only a small subset is needed:
```python
import os
import glob
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import util
# Flickr 8k images
img_folder = 'photos/'
caps_folder = 'captions/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
os.makedirs(img_folder, exist_ok=True)
if not os.path.exists('Flickr8k_Dataset.zip'): #Download dataset if does not exist
util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')
util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')
for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:
with zipfile.ZipFile(file, 'r') as zf:
for member in tqdm(zf.infolist(), desc='Extracting'):
zf.extract(member, folder)
images = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))
```
Next, we define the embedding and representation models for our pipeline:
```python
from bertopic.representation import KeyBERTInspired, VisualRepresentation
from bertopic.backend import MultiModalBackend
# Image embedding model
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
# Image to text representation model
representation_model = {
"Visual_Aspect": VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning")
}
```
Using these models, we can run our pipeline:
```python
from bertopic import BERTopic
# Train our model with images only
topic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, min_topic_size=30)
topics, probs = topic_model.fit_transform(documents=None, images=images)
```
We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`.
If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:
```python
import base64
from io import BytesIO
from IPython.display import HTML
def image_base64(im):
if isinstance(im, str):
im = get_thumbnail(im)
with BytesIO() as buffer:
im.save(buffer, 'jpeg')
return base64.b64encode(buffer.getvalue()).decode()
def image_formatter(im):
return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
# Extract dataframe
df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1)
# Visualize the images
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
```
<br><br>
<img src="images_only.jpg">
<br><br>
@@ -0,0 +1,140 @@
Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic.
!!! Tip
Another method for online topic modeling can be found with the [**.merge_models**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) functionality of BERTopic. It allows for merging multiple BERTopic models to create a single new one. This method can be used to discover new topics by training a new model and exploring whether that new model added new topics to the original model when merging. A major benefit, compared to `.partial_fit`, is that you can keep using the original UMAP and HDBSCAN models, which tends to result in improved performance and gives you significantly more flexibility.
In BERTopic, there are three main goals for using this technique.
* To reduce the memory necessary for training a topic model.
* To continuously update the topic model as new data comes in.
* To continuously find new topics as new data comes in.
In BERTopic, online topic modeling can be a bit tricky as there are several steps involved in which online learning needs to be made available. To recap, BERTopic consists of the following 6 steps:
1. Extract embeddings
2. Reduce dimensionality
3. Cluster reduced embeddings
4. Tokenize topics
5. Extract topic words
6. (Optional) Fine-tune topic words
For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is by itself incremental, then so will steps 5 and 6.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/online/online.svg"
</div>
<br>
This means that we will need online variants for steps 2 through 4. Steps 2 and 3, dimensionality reduction and clustering, can be modeled through the use of Scikit-Learn's `.partial_fit` function. In other words, it supports any algorithm that can be trained using `.partial_fit` since these algorithms can be trained incrementally. For example, incremental dimensionality reduction can be achieved using Scikit-Learn's `IncrementalPCA` and incremental clustering with `MiniBatchKMeans`.
Lastly, we need to develop an online variant for step 4, tokenization. In this step, a Bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the `decay` parameter is a value between 0 and 1 that sets the percentage by which the frequencies in the previous bag-of-words matrix are reduced. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. This will make sure that recent data has more weight than previous iterations. Similarly, `delete_min_df` will remove certain words from its vocabulary if their frequency is lower than a set value. This ties together with the `decay` parameter as some words will decay over time if not used. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer).
## **Example**
Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.
```python
from sklearn.datasets import fetch_20newsgroups
# Prepare documents
all_docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))["data"]
doc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]
```
Here, we created chunks of 1000 documents to be fed into BERTopic. Then, we will need to define several sub-models that support online learning. Specifically, we are going to be using `IncrementalPCA`, `MiniBatchKMeans`, and the `OnlineCountVectorizer`:
```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer
# Prepare sub-models that support online learning
umap_model = IncrementalPCA(n_components=5)
cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)
```
After having defined our sub-models, we can start training our topic model incrementally by looping over our document chunks:
```python
from bertopic import BERTopic
topic_model = BERTopic(umap_model=umap_model,
hdbscan_model=cluster_model,
vectorizer_model=vectorizer_model)
# Incrementally fit the topic model by training on 1000 documents at a time
for docs in doc_chunks:
topic_model.partial_fit(docs)
```
And that is it! During each iteration, you can access the predicted topics through the `.topics_` attribute.
!!! note
Do note that in BERTopic it is not possible to use `.partial_fit` after the `.fit` as they work quite differently concerning internally updating topics, frequencies, representations, etc.
!!! tip Tip
You can use any other dimensionality reduction and clustering algorithm as long as they have a `.partial_fit` function. Moreover, you can use dimensionality reduction algorithms that do not support `.partial_fit` functions but do have a `.fit` function to first train it on a large amount of data and then continuously add documents. The dimensionality reduction will not be updated but may be trained sufficiently to properly reduce the embeddings without the need to continuously add documents.
!!! warning
Only the most recent batch of documents is tracked. If you want to be using online topic modeling for low-memory use cases, then it is advised to also update the `.topics_` attribute. Otherwise, variations such as **hierarchical topic modeling** will not work.
```python
# Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration
topics = []
for docs in doc_chunks:
topic_model.partial_fit(docs)
topics.extend(topic_model.topics_)
topic_model.topics_ = topics
```
## **River**
To continuously find new topics as they come in, we can use the package [river](https://github.com/online-ml/river). It contains several clustering models that can create new clusters as new data comes in. To make sure we can use their models, we first need to create a class that has a `.partial_fit` function and the option to extract labels through `.labels_`:
```python
from river import stream
from river import cluster
class River:
def __init__(self, model):
self.model = model
def partial_fit(self, umap_embeddings):
for umap_embedding, _ in stream.iter_array(umap_embeddings):
self.model.learn_one(umap_embedding)
labels = []
for umap_embedding, _ in stream.iter_array(umap_embeddings):
label = self.model.predict_one(umap_embedding)
labels.append(label)
self.labels_ = labels
return self
```
Then, we can choose any `river.cluster` model that we are interested in and pass it to the `River` class before using it in BERTopic:
```python
# Using DBSTREAM to detect new topics as they come in
cluster_model = River(cluster.DBSTREAM())
vectorizer_model = OnlineCountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)
# Prepare model
topic_model = BERTopic(
hdbscan_model=cluster_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
)
# Incrementally fit the topic model by training on 1000 documents at a time
for docs in doc_chunks:
topic_model.partial_fit(docs)
```
@@ -0,0 +1,26 @@
<svg width="684" height="125" viewBox="0 0 684 125" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="534" height="57" fill="white"/>
<rect x="0.5" y="14.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="10.9697">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="162" y="10.9697">IncrementalPCA&#10;</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="304" y="10.9697">MiniBatchKMeans&#10;</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="437" y="10.9697">Online CountVectorizer</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="38.7637">Embeddings</tspan></text>
<rect x="142.5" y="14.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="156.094" y="33.7637">Dimensionality &#10;</tspan><tspan x="171.762" y="47.7637">reduction</tspan></text>
<path d="M126.707 33.7071C127.098 33.3166 127.098 32.6834 126.707 32.2929L120.343 25.9289C119.953 25.5384 119.319 25.5384 118.929 25.9289C118.538 26.3195 118.538 26.9526 118.929 27.3431L124.586 33L118.929 38.6569C118.538 39.0474 118.538 39.6805 118.929 40.0711C119.319 40.4616 119.953 40.4616 120.343 40.0711L126.707 33.7071ZM99 34H126V32H99V34Z" fill="black"/>
<rect x="295.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="317" y="38.7637">Clustering</tspan></text>
<path d="M285.707 33.7071C286.098 33.3166 286.098 32.6834 285.707 32.2929L279.343 25.9289C278.953 25.5384 278.319 25.5384 277.929 25.9289C277.538 26.3195 277.538 26.9526 277.929 27.3431L283.586 33L277.929 38.6569C277.538 39.0474 277.538 39.6805 277.929 40.0711C278.319 40.4616 278.953 40.4616 279.343 40.0711L285.707 33.7071ZM258 34H285V32H258V34Z" fill="black"/>
<rect x="442.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="454.873" y="30.7637">Incremental &#10;</tspan><tspan x="452.137" y="44.7637">Bag-of-Words</tspan></text>
<path d="M426.707 33.7071C427.098 33.3166 427.098 32.6834 426.707 32.2929L420.343 25.9289C419.953 25.5384 419.319 25.5384 418.929 25.9289C418.538 26.3195 418.538 26.9526 418.929 27.3431L424.586 33L418.929 38.6569C418.538 39.0474 418.538 39.6805 418.929 40.0711C419.319 40.4616 419.953 40.4616 420.343 40.0711L426.707 33.7071ZM399 34H426V32H399V34Z" fill="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="618" y="10.9697">c-TF-IDF</tspan></text>
<rect x="592.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="622.404" y="30.7637">Topic &#10;</tspan><tspan x="600.215" y="44.7637">representation</tspan></text>
<path d="M576.707 33.7071C577.098 33.3166 577.098 32.6834 576.707 32.2929L570.343 25.9289C569.953 25.5384 569.319 25.5384 568.929 25.9289C568.538 26.3195 568.538 26.9526 568.929 27.3431L574.586 33L568.929 38.6569C568.538 39.0474 568.538 39.6805 568.929 40.0711C569.319 40.4616 569.953 40.4616 570.343 40.0711L576.707 33.7071ZM549 34H576V32H549V34Z" fill="black"/>
<line x1="134.5" y1="68" x2="134.5" y2="93" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="178.166" y="107.764">Online variants of these steps in the main BERTopic pipeline&#10;</tspan><tspan x="199.107" y="121.764"> are needed in order to enable incremental learning. </tspan></text>
<line x1="544.5" y1="68" x2="544.5" y2="93" stroke="black"/>
<line x1="134" y1="92.5" x2="544" y2="92.5" stroke="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.4 KiB

File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,196 @@
When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created
that do not fall within any of the created topics. These are labeled as -1. Depending on your use case, you might want
to decrease the number of documents that are labeled as outliers. Fortunately, there are a number of strategies one might
use to reduce the number of outliers after you have trained your BERTopic model.
The main way to reduce your outliers in BERTopic is by using the `.reduce_outliers` function. To make it work without too much tweaking, you will only need to pass the `docs` and their corresponding `topics`. You can pass outlier and non-outlier documents together since it will only try to reduce outlier documents and label them to a non-outlier topic.
The following is a minimal example:
```python
from bertopic import BERTopic
# Train your BERTopic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers
new_topics = topic_model.reduce_outliers(docs, topics)
```
!!! note
You can use the `threshold` parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics. This allows the user to control how many outlier documents are assigned to non-outlier topics.
## **Strategies**
The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them
to the best matching c-TF-IDF representations of non-outlier topics.
However, there are a number of other strategies one can use, either separately or in conjunction that are worthwhile to explore:
* Using the topic-document probabilities to assign topics
* Using the topic-document distributions to assign topics
* Using c-TF-IDF representations to assign topics
* Using document and topic embeddings to assign topics
### **Probabilities**
This strategy uses the soft-clustering as performed by HDBSCAN to find the
best matching topic for each outlier document. To use this, make
sure to calculate the `probabilities` beforehand by instantiating
BERTopic with `calculate_probabilities=True`.
```python
from bertopic import BERTopic
# Train your BERTopic model and calculate the document-topic probabilities
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers using the `probabilities` strategy
new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
```
### **Topic Distributions**
Use the topic distributions, as calculated with `.approximate_distribution`
to find the most frequent topic in each outlier document. You can use the
`distributions_params` variable to tweak the parameters of
`.approximate_distribution`.
```python
from bertopic import BERTopic
# Train your BERTopic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers using the `distributions` strategy
new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions")
```
### **c-TF-IDF**
Calculate the c-TF-IDF representation for each outlier document and
find the best matching c-TF-IDF topic representation using
cosine similarity.
```python
from bertopic import BERTopic
# Train your BERTopic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers using the `c-tf-idf` strategy
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")
```
### **Embeddings**
Using the embeddings of each outlier document, find the best
matching topic embedding using cosine similarity.
```python
from bertopic import BERTopic
# Train your BERTopic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
# Reduce outliers using the `embeddings` strategy
new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
```
!!! note
If you have pre-calculated the document embeddings, you can speed up the outlier
reduction process for the `"embeddings"` strategy as it will prevent re-calculating
the document embeddings.
### **Chain Strategies**
Since the `.reduce_outliers` function does not internally update the topics, we can easily try out different strategies but also chain them together.
You might want to do a first pass with the `"c-tf-idf"` strategy as it is quite fast. Then, we can perform the `"distributions"` strategy on the
outliers that are left since this method is typically much slower:
```python
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)
# Reduce all outliers that are left with the "distributions" strategy
new_topics = topic_model.reduce_outliers(docs, new_topics, strategy="distributions")
```
## **Update Topics**
After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves.
!!! warning
In both cases, it is important to realize that
updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how you map the -1 documents. Is it matched to topic 1 or 2?
### **Update Topic Representation**
When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics. To do so, we can make use of the `.update_topics` function:
```python
topic_model.update_topics(docs, topics=new_topics)
```
As seen above, you will only need to pass the documents on which the model was trained including the new topics that were generated using one of the above four strategies.
### **Exploration**
When you are reducing the number of outliers, it might be worthwhile to iteratively visualize the results in order to get an intuitive understanding of the effect of the above four strategies. Making use of `.visualize_documents`, we can quickly iterate over the different strategies and view their effects. Here, an example will be shown on how to approach such a pipeline.
First, we train our model:
```python
from umap import UMAP
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Prepare data, extract embeddings, and prepare sub-models
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
vectorizer_model = CountVectorizer(stop_words="english")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
# We reduce our embeddings to 2D as it will allow us to quickly iterate later on
reduced_embeddings = UMAP(n_neighbors=10, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(embeddings)
# Train our topic model
topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model,
vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40)
topics, probs = topic_model.fit_transform(docs, embeddings)
```
After having trained our model, let us take a look at the 2D representation of the generated topics:
```python
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
```
<iframe src="fig_base.html" style="width:800px; height: 800px; border: 0px;"></iframe>
Next, we reduce the number of outliers using the `probabilities` strategy:
```python
new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs,
threshold=0.05, strategy="probabilities")
topic_model.update_topics(docs, topics=new_topics)
```
And finally, we visualize the results:
```python
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
```
<iframe src="fig_reduced.html" style="width:800px; height: 800px; border: 0px;"></iframe>
@@ -0,0 +1,105 @@
# Hyperparameter Tuning
Although BERTopic works quite well out of the box, there are a number of hyperparameters to tune according to your use case.
This section will focus on important parameters directly accessible in BERTopic but also hyperparameter optimization in sub-models
such as HDBSCAN and UMAP.
## **BERTopic**
When instantiating BERTopic, there are several hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them.
### **language**
The `language` parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models.
In essence, there are two options to choose from:
* `language = "english"` or
* `language = "multilingual"`
The English model is "all-MiniLM-L6-v2" and can be found [here](https://www.sbert.net/docs/pretrained_models.html). It is the default model that is used in BERTopic and works great for English documents.
The multilingual model is "paraphrase-multilingual-MiniLM-L12-v2" and supports 50+ languages which can be found [here](https://www.sbert.net/docs/pretrained_models.html). The model is very similar to the base model but is trained on many languages and has a slightly different architecture.
### **top_n_words**
`top_n_words` refers to the number of words per topic that you want to be extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative of the topic and should be focused on.
### **n_gram_range**
The `n_gram_range` parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. For example, "New" and "York" are two separate words but are often used as "New York" which represents an n-gram of 2. Thus, the `n_gram_range` should be set to (1, 2) if you want "New York" in your topic representation.
### **min_topic_size**
`min_topic_size` is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many microclusters.
It is advised to play around with this value depending on the size of your dataset. If it nears a million documents, then it is advised to set it much higher than the default of 10, for example, 100 or even 500.
### **nr_topics**
`nr_topics` can be a tricky parameter. It specifies the number of topics that the model will be reduced to after training. For example, if your topic model results in 100 topics but you have set `nr_topics` to 20 then the topic model will try to reduce the number of topics from 100 to 20.
This reduction can take a while as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics using HDBSCAN.
### **low_memory**
`low_memory` sets UMAP's `low_memory` to True to make sure that less memory is used in the computation. This slows down computation but allows UMAP to be run on low-memory machines.
### **calculate_probabilities**
`calculate_probabilities` lets you calculate the probabilities of each topic in each document. This is computationally quite expensive and is turned off by default.
## **UMAP**
UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embedding into something easier to use with HDBSCAN to create good clusters.
However, it does have a significant number of parameters you could take into account. As exposing all parameters in BERTopic would be difficult to manage, we can instantiate our UMAP model and pass it to BERTopic:
```python
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False)
topic_model = BERTopic(umap_model=umap_model).fit(docs)
```
### **n_neighbors**
`n_neighbors` is the number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a
more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters
being created.
### **n_components**
`n_components` refers to the dimensionality of the embeddings after reducing them. This is set as a default to `5` to reduce dimensionality
as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value influences the quality of embeddings, its effect is largest on the performance of HDBSCAN. Increase this value too much and HDBSCAN will have a
hard time clustering the high-dimensional embeddings. Lower this value too much and too little information in the resulting embeddings is available
to create proper clusters. If you want to increase this value, I would advise using a metric for HDBSCAN that works well in high dimensional data.
### **metric**
`metric` refers to the method used to compute the distances in high dimensional space. The default is `cosine` as we are dealing with high dimensional data. However, BERTopic is also able to use any input, even regular tabular data, to cluster the documents. Thus, you might want to change the metric
to something that fits your use case.
### **low_memory**
`low_memory` is used when datasets may consume a lot of memory. Using millions of documents can lead to memory issues and setting this value to `True`
might alleviate some of the issues.
## **HDBSCAN**
After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked to improve the cluster's quality.
```python
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)
topic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs)
```
### **min_cluster_size**
`min_cluster_size` is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters
that will be generated. It is set to `10` as a default. Increasing this value results in fewer clusters but of larger size whereas decreasing this value
results in more micro clusters being generated. Typically, I would advise increasing this value rather than decreasing it.
### **min_samples**
`min_samples` is automatically set to `min_cluster_size` and controls the number of outliers generated. Setting this value significantly lower than
`min_cluster_size` might help you reduce the amount of noise you will get. Do note that outliers are to be expected and forcing the output
to have no outliers may not properly represent the data.
### **metric**
`metric`, like with UMAP, is used to calculate the distances. Here, we went with `euclidean` as, after reducing the dimensionality, we have
low dimensional data and not much optimization is necessary. However, if you increase `n_components` in UMAP, then it would be advised to look into
metrics that work with high dimensional data.
### **prediction_data**
Make sure you always set this value to `True` as it is needed to predict new points later on. You can set this to False if you do not wish to predict
any unseen data points.
@@ -0,0 +1,182 @@
## **Installation**
Installation, with sentence-transformers, can be done using [pypi](https://pypi.org/project/bertopic/):
```bash
pip install bertopic
```
You may want to install more depending on the transformers and language backends that you will be using.
The possible installations are:
```bash
# Choose an embedding backend
pip install "bertopic[flair,gensim,spacy,use]"
# Topic modeling with images
pip install bertopic[vision]
```
## **Quick Start**
We start by extracting topics from the well-known 20 newsgroups dataset which is comprised of English documents:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
```
After generating topics, we can access the frequent topics that were generated:
```python
>>> topic_model.get_topic_info()
Topic Count Name
-1 4630 -1_can_your_will_any
0 693 49_windows_drive_dos_file
1 466 32_jesus_bible_christian_faith
2 441 2_space_launch_orbit_lunar
3 381 22_key_encryption_keys_encrypted
```
-1 refers to all outliers and should typically be ignored. Next, let's take a look at the most
frequent topic that was generated, topic 0:
```python
>>> topic_model.get_topic(0)
[('windows', 0.006152228076250982),
('drive', 0.004982897610645755),
('dos', 0.004845038866360651),
('file', 0.004140142872194834),
('disk', 0.004131678774810884),
('mac', 0.003624848635985097),
('memory', 0.0034840976976789903),
('software', 0.0034415334250699077),
('email', 0.0034239554442333257),
('pc', 0.003047105930670237)]
```
Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.:
```python
>>> topic_model.get_document_info(docs)
Document Topic Name Top_n_words Probability ...
I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ...
My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ...
Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ...
Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ...
1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ...
```
!!! Tip "Multilingual"
Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages.
## **Fine-tune Topic Representations**
In BERTopic, there are a number of different [topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. A great start is `KeyBERTInspired`, which for many users increases the coherence and reduces stopwords from the resulting topic representations:
```python
from bertopic.representation import KeyBERTInspired
# Fine-tune your topic representations
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)
```
However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more:
```python
import openai
from bertopic.representation import OpenAI
# Fine-tune topic representations with GPT
client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True)
topic_model = BERTopic(representation_model=representation_model)
```
!!! tip "Multi-aspect Topic Modeling"
Instead of iterating over all of these different topic representations, you can model them simultaneously with [multi-aspect topic representations](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) in BERTopic.
## **Visualizations**
After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good
understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can use one of the [many visualization options](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html) in BERTopic. For example, we can visualize the topics that were generated in a way very similar to
[LDAvis](https://github.com/cpsievert/LDAvis):
```python
topic_model.visualize_topics()
```
<iframe src="viz.html" style="width:1000px; height: 680px; border: 0px;"></iframe>
## **Save/Load BERTopic model**
There are three methods for saving BERTopic:
1. A light model with `.safetensors` and config files
2. A light model with pytorch `.bin` and config files
3. A full model with `.pickle`
Method 3 allows for saving the entire topic model but has several drawbacks:
* Arbitrary code can be run from `.pickle` files
* The resulting model is rather large (often > 500MB) since all sub-models need to be saved
* Explicit and specific version control is needed as they typically only run if the environment is exactly the same
> **It is advised to use methods 1 or 2 for saving.**
These methods have a number of advantages:
* `.safetensors` is a relatively **safe format**
* The resulting model can be **very small** (often < 20MB) since no sub-models need to be saved
* Although version control is important, there is a bit more **flexibility** with respect to specific versions of packages
* More easily used in **production**
* **Share** models with the HuggingFace Hub
!!! Tip "Tip"
For more detail about how to load in a custom vectorizer, representation model, and more, it is highly advised to check out the [serialization](https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html) page. It contains more examples, details, and some tips and tricks for loading and saving your environment.
The methods are used as follows:
```python
topic_model = BERTopic().fit(my_docs)
# Method 1 - safetensors
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("path/to/my/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
# Method 2 - pytorch
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("path/to/my/model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)
# Method 3 - pickle
topic_model.save("my_model", serialization="pickle")
```
To load a model:
```python
# Load from directory
loaded_model = BERTopic.load("path/to/my/model_dir")
# Load from file
loaded_model = BERTopic.load("my_model")
# Load from HuggingFace
loaded_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")
```
!!! Warning "Warning"
When saving the model, make sure to also keep track of the versions of dependencies and Python used.
Loading and saving the model should be done using the same dependencies and Python. Moreover, models
saved in one version of BERTopic should not be loaded in other versions.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,13 @@
<svg width="705" height="199" viewBox="0 0 705 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="47.7637">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="0" y="19.7637">&#10;</tspan><tspan x="18" y="33.7637">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="61.7637">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="75.7637">&#10;</tspan><tspan x="18" y="89.7637">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="103.764">&#10;</tspan><tspan x="18" y="117.764">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="131.764">&#10;</tspan><tspan x="18" y="145.764">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="159.764">&#10;</tspan><tspan x="18" y="173.764">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="14.0576">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="452" y="19.7637">&#10;</tspan><tspan x="470" y="33.7637">Organic food&#10;</tspan><tspan x="452" y="47.7637">&#10;</tspan><tspan x="470" y="61.7637">Exploding planets&#10;</tspan><tspan x="452" y="75.7637">&#10;</tspan><tspan x="470" y="89.7637">How your immune system works&#10;</tspan><tspan x="452" y="103.764">&#10;</tspan><tspan x="470" y="117.764">How tides work&#10;</tspan><tspan x="452" y="131.764">&#10;</tspan><tspan x="470" y="145.764">How democratic is the European Union?&#10;</tspan><tspan x="452" y="159.764">&#10;</tspan><tspan x="470" y="173.764">Plastic pollution&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="18.0576">Cohere</tspan></text>
<path d="M448.354 170.354C448.549 170.158 448.549 169.842 448.354 169.646L445.172 166.464C444.976 166.269 444.66 166.269 444.464 166.464C444.269 166.66 444.269 166.976 444.464 167.172L447.293 170L444.464 172.828C444.269 173.024 444.269 173.34 444.464 173.536C444.66 173.731 444.976 173.731 445.172 173.536L448.354 170.354ZM418 170.5H448V169.5H418V170.5Z" fill="black"/>
<path d="M448.354 143.354C448.549 143.158 448.549 142.842 448.354 142.646L445.172 139.464C444.976 139.269 444.66 139.269 444.464 139.464C444.269 139.66 444.269 139.976 444.464 140.172L447.293 143L444.464 145.828C444.269 146.024 444.269 146.34 444.464 146.536C444.66 146.731 444.976 146.731 445.172 146.536L448.354 143.354ZM418 143.5H448V142.5H418V143.5Z" fill="black"/>
<path d="M448.354 114.354C448.549 114.158 448.549 113.842 448.354 113.646L445.172 110.464C444.976 110.269 444.66 110.269 444.464 110.464C444.269 110.66 444.269 110.976 444.464 111.172L447.293 114L444.464 116.828C444.269 117.024 444.269 117.34 444.464 117.536C444.66 117.731 444.976 117.731 445.172 117.536L448.354 114.354ZM418 114.5H448V113.5H418V114.5Z" fill="black"/>
<path d="M448.354 86.3536C448.549 86.1583 448.549 85.8417 448.354 85.6464L445.172 82.4645C444.976 82.2692 444.66 82.2692 444.464 82.4645C444.269 82.6597 444.269 82.9763 444.464 83.1716L447.293 86L444.464 88.8284C444.269 89.0237 444.269 89.3403 444.464 89.5355C444.66 89.7308 444.976 89.7308 445.172 89.5355L448.354 86.3536ZM418 86.5H448V85.5H418V86.5Z" fill="black"/>
<path d="M448.354 58.3536C448.549 58.1583 448.549 57.8417 448.354 57.6464L445.172 54.4645C444.976 54.2692 444.66 54.2692 444.464 54.4645C444.269 54.6597 444.269 54.9763 444.464 55.1716L447.293 58L444.464 60.8284C444.269 61.0237 444.269 61.3403 444.464 61.5355C444.66 61.7308 444.976 61.7308 445.172 61.5355L448.354 58.3536ZM418 58.5H448V57.5H418V58.5Z" fill="black"/>
<path d="M448.354 30.3536C448.549 30.1583 448.549 29.8417 448.354 29.6464L445.172 26.4645C444.976 26.2692 444.66 26.2692 444.464 26.4645C444.269 26.6597 444.269 26.9763 444.464 27.1716L447.293 30L444.464 32.8284C444.269 33.0237 444.269 33.3403 444.464 33.5355C444.66 33.7308 444.976 33.7308 445.172 33.5355L448.354 30.3536ZM418 30.5H448V29.5H418V30.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.5 KiB

@@ -0,0 +1,13 @@
<svg width="687" height="199" viewBox="0 0 687 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">beef&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">volcanoes&#10;</tspan><tspan x="452" y="79.8636">&#10;</tspan><tspan x="470" y="94.8636">immune system&#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">earth&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">european union&#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">cotton</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">&#x1f917; Transformers</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.4 KiB

@@ -0,0 +1,13 @@
<svg width="907" height="199" viewBox="0 0 907 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">organic | meat | foods | crops | beef | produce | food | diet | cows | eating&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">explosion | explodes | eruptions | eruption | blast | volcanoes | volcanic&#10;</tspan><tspan x="452" y="79.8636">&#10;</tspan><tspan x="470" y="94.8636">immune | immunology | antibodies | disease | cells | infection | cell | system &#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">moon | moons | lunar | tides | tidal | gravity | orbit | satellites | earth | orbits&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">eu | democracy | european | democratic | parliament | governments | voting&#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">plastics | plastic | pollution | microplastics | environmental | polymers | bpa</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">KeyBERT-Inspired</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.8 KiB

@@ -0,0 +1,23 @@
<svg width="508" height="269" viewBox="0 0 508 269" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="199.5" y="0.5" width="69" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="246.262" y="23.7637"> n</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="214" y="23.7637">Topic</tspan></text>
<rect x="253.5" y="76.5" width="142" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="261.24" y="92.8636">Extract representative </tspan><tspan x="292.383" y="107.864">documents</tspan></text>
<rect x="78.5" y="153.5" width="142" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="99.4297" y="169.864">Embed candidate </tspan><tspan x="121.801" y="184.864">keywords</tspan></text>
<rect x="135.5" y="229.5" width="204" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="149.789" y="245.864">Compare embedded keywords </tspan><tspan x="159.1" y="260.864">with embedded documents</tspan></text>
<rect x="255.5" y="153.5" width="142" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="292.262" y="169.864">Embed and &#10;</tspan><tspan x="269.955" y="184.864">average documents </tspan></text>
<path d="M189.078 64.6133C188.864 65.1226 189.104 65.7086 189.613 65.9222L197.913 69.4027C198.422 69.6163 199.008 69.3766 199.222 68.8673C199.436 68.358 199.196 67.7719 198.686 67.5583L191.309 64.4645L194.403 57.087C194.616 56.5777 194.377 55.9916 193.867 55.7781C193.358 55.5645 192.772 55.8042 192.558 56.3135L189.078 64.6133ZM233.621 46.0745L189.621 64.0745L190.379 65.9255L234.379 47.9255L233.621 46.0745Z" fill="black"/>
<path d="M278.922 64.6133C279.136 65.1226 278.896 65.7086 278.387 65.9222L270.087 69.4027C269.578 69.6163 268.992 69.3766 268.778 68.8673C268.564 68.358 268.804 67.7719 269.314 67.5583L276.691 64.4645L273.597 57.087C273.384 56.5777 273.623 55.9916 274.133 55.7781C274.642 55.5645 275.228 55.8042 275.442 56.3135L278.922 64.6133ZM234.379 46.0745L278.379 64.0745L277.621 65.9255L233.621 47.9255L234.379 46.0745Z" fill="black"/>
<path d="M281.078 219.613C280.864 220.123 281.104 220.709 281.613 220.922L289.913 224.403C290.422 224.616 291.008 224.377 291.222 223.867C291.436 223.358 291.196 222.772 290.686 222.558L283.309 219.465L286.403 212.087C286.616 211.578 286.377 210.992 285.867 210.778C285.358 210.564 284.772 210.804 284.558 211.314L281.078 219.613ZM325.621 201.074L281.621 219.074L282.379 220.926L326.379 202.926L325.621 201.074Z" fill="black"/>
<path d="M195.922 219.613C196.136 220.123 195.896 220.709 195.387 220.922L187.087 224.403C186.578 224.616 185.992 224.377 185.778 223.867C185.564 223.358 185.804 222.772 186.314 222.558L193.691 219.465L190.597 212.087C190.384 211.578 190.623 210.992 191.133 210.778C191.642 210.564 192.228 210.804 192.442 211.314L195.922 219.613ZM151.379 201.074L195.379 219.074L194.621 220.926L150.621 202.926L151.379 201.074Z" fill="black"/>
<path d="M150.293 149.707C150.683 150.098 151.317 150.098 151.707 149.707L158.071 143.343C158.462 142.953 158.462 142.319 158.071 141.929C157.681 141.538 157.047 141.538 156.657 141.929L151 147.586L145.343 141.929C144.953 141.538 144.319 141.538 143.929 141.929C143.538 142.319 143.538 142.953 143.929 143.343L150.293 149.707ZM150 123L150 149L152 149L152 123L150 123Z" fill="black"/>
<path d="M325.293 149.707C325.683 150.098 326.317 150.098 326.707 149.707L333.071 143.343C333.462 142.953 333.462 142.319 333.071 141.929C332.681 141.538 332.047 141.538 331.657 141.929L326 147.586L320.343 141.929C319.953 141.538 319.319 141.538 318.929 141.929C318.538 142.319 318.538 142.953 318.929 143.343L325.293 149.707ZM325 123L325 149L327 149L327 123L325 123Z" fill="black"/>
<rect x="78.5" y="76.5" width="142" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="99.0254" y="92.8636">Extract candidate </tspan><tspan x="121.801" y="107.864">keywords</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="296" y="51.8636">Compare c-TF-IDF sampled </tspan><tspan x="296" y="66.8636">documents with the topic c-TF-IDF. </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="0" y="50.7637">Extract top n words per topic </tspan><tspan x="0" y="64.7637">based on their c-TF-IDF scores</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 5.0 KiB

@@ -0,0 +1,619 @@
As we have seen in the [previous section](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html), the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solutions.
Using these techniques, we can further fine-tune topics to generate labels, summaries, poems of topics, and more. To do so, we first generate a set of keywords and documents that describe a topic best using BERTopic's c-TF-IDF calculation. Then, these candidate keywords and documents are passed to the text generation model, which is asked to generate output that fits the topic best.
A huge benefit of this is that we can describe a topic with only a few documents and we therefore do not need to pass all documents to the text generation model. Not only does this speed up the generation of topic labels significantly, you also do not need a massive amount of credits when using an external API, such as Cohere or OpenAI.
## **Prompt Engineering**
In most of the examples below, we use certain tags to customize our prompts. There are currently two tags, namely `"[KEYWORDS]"` and `"[DOCUMENTS]"`.
These tags indicate where in the prompt they are to be replaced with a topic's keywords and top 4 most representative documents respectively.
For example, if we have the following prompt:
```python
prompt = """
I have a topic that contains the following documents: \n[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the above information, can you give a short label of the topic?
"""
```
then that will be rendered as follows:
```python
"""
I have a topic that contains the following documents:
- Our videos are also made possible by your support on patreon.co.
- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop.
- If you want to help us make more videos, you can do so there.
- And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com.
The topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch
Based on the above information, can you give a short label of the topic?
"""
```
!!! tip "Tip 1"
You can access the default prompts of these models with `representation_model.default_prompt_`. The prompts that were generated after training can be accessed with `topic_model.representation_model.prompts_`.
### **Selecting Documents**
By default, four of the most representative documents will be passed to `[DOCUMENTS]`. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected.
To increase the number of documents passed to `[DOCUMENTS]`, we can use the `nr_docs` parameter which is accessible in all LLMs on this page. Using this value allows you to select the top *n* most representative documents instead. If you have a long enough context length, then you could even give the LLM dozens of documents.
However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the `diversity` parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders!
### **Truncating Documents**
We can truncate the input documents in `[DOCUMENTS]` in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:
* `doc_length`
* The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
* `tokenizer`
* The tokenizer used to split the document into segments, which are then counted to determine the length of a document.
* If tokenizer is `'char'`, then the document is split up into characters which are counted to adhere to `doc_length`
* If tokenizer is `'whitespace'`, the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length`
* If tokenizer is `'vectorizer'`, then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on `doc_length`
* If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length`
This means that the definition of `doc_length` changes depending on what constitutes a token in the `tokenizer` parameter. If a token is a character, then `doc_length` refers to the max length in characters. If a token is a word, then `doc_length` refers to the max length in words.
Let's illustrate this with an example. In the code below, we will use [`tiktoken`](https://github.com/openai/tiktoken) to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.
We start by installing the relevant packages:
```bash
pip install tiktoken openai
```
Then, we use `bertopic.representation.OpenAI` to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:
```python
import openai
import tiktoken
from bertopic.representation import OpenAI
from bertopic import BERTopic
# Tokenizer
tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo")
# Create your representation model
client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(
client,
model="gpt-3.5-turbo",
delay_in_seconds=2,
chat=True,
nr_docs=4,
doc_length=100,
tokenizer=tokenizer
)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
## **🤗 Transformers**
Nearly every week, there are new and improved models released on the 🤗 [Model Hub](https://huggingface.co/models) that, with some creativity, allow for
further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these
methods are created as a way to support whatever might be released in the future.
Using a GPT-like model from the huggingface hub is rather straightforward:
```python
from bertopic.representation import TextGeneration
from bertopic import BERTopic
# Create your representation model
representation_model = TextGeneration('gpt2')
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
GPT2, however, is not the most accurate model available on the HuggingFace Model Hub. You can get
much better results with a `flan-T5` like model:
```python
from transformers import pipeline
from bertopic.representation import TextGeneration
prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?"
# Create your representation model
generator = pipeline('text2text-generation', model='google/flan-t5-base')
representation_model = TextGeneration(generator)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/hf.svg"
</div>
<br>
As can be seen from the example above, if you would like to use a `text2text-generation` model, you will need to
pass a `transformers.pipeline` with the `"text2text-generation"` parameter. Moreover, you can use a custom prompt and decide where the keywords should
be inserted by using the `[KEYWORDS]` tag, or where the documents should be inserted by using the `[DOCUMENTS]` tag.
### **Mistral (GGUF)**
We can go a step further with open-source Large Language Models (LLMs) that have been shown to match the performance of closed-source LLMs like ChatGPT.
In this example, we will show you how to use Zephyr, a fine-tuned version of Mistral 7B. Mistral 7B outperforms other open-source LLMs at a much smaller scale and is a worthwhile solution for use cases such as topic modeling. We want to keep inference as fast as possible and a relatively small model helps with that. Zephyr is a fine-tuned version of Mistral 7B that was trained on a mix of publicly available and synthetic datasets using Direct Preference Optimization (DPO).
To use Zephyr in BERTopic, we will first need to install and update a couple of packages that can handle quantized versions of Zephyr:
```bash
pip install ctransformers[cuda]
pip install --upgrade git+https://github.com/huggingface/transformers
```
Instead of loading in the full model, we can load a quantized model, which is a compressed version of the original model:
```python
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/zephyr-7B-alpha-GGUF",
model_file="zephyr-7b-alpha.Q4_K_M.gguf",
model_type="mistral",
gpu_layers=50,
hf=True
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
# Pipeline
generator = pipeline(
model=model, tokenizer=tokenizer,
task='text-generation',
max_new_tokens=50,
repetition_penalty=1.1
)
```
This Zephyr model requires a specific prompt template in order to work:
```python
prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""
```
After creating this prompt template, we can create our representation model to be used in BERTopic:
```python
from bertopic.representation import TextGeneration
# Text generation with Zephyr
zephyr = TextGeneration(generator, prompt=prompt)
representation_model = {"Zephyr": zephyr}
# Topic Modeling
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
### **Llama (Manual Quantization)**
Full Llama Tutorial: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing)
Open-source LLMs are starting to become more and more popular. Here, we will go through a minimal example of using [Llama 2](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) together with BERTopic.
!!! Note
Although this is an example of the older Llama 2 model, you can use the code below for any Llama variant.
First, we need to load in our Llama model:
```python
from torch import bfloat16
import transformers
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
load_in_4bit=True, # 4-bit quantization
bnb_4bit_quant_type='nf4', # Normalized float 4
bnb_4bit_use_double_quant=True, # Second quantization after the first
bnb_4bit_compute_dtype=bfloat16 # Computation type
)
# Llama model to load from the Hugging Face Hub
model_id = 'meta-llama/Llama-2-13b-chat-hf'
# Llama Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
# Llama Model
model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map='auto',
)
model.eval()
# Our text generator
generator = transformers.pipeline(
model=model, tokenizer=tokenizer,
task='text-generation',
temperature=0.1,
max_new_tokens=500,
repetition_penalty=1.1
)
```
After doing so, we will need to define a prompt that works with both Llama and BERTopic:
```python
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
# Example prompt demonstrating the output we are looking for
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""
prompt = system_prompt + example_prompt + main_prompt
```
Three pieces of the prompt were created:
* `system_prompt` helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics.
* `example_prompt` gives an example of a correctly labeled topic to guide Llama
* `main_prompt` contains the main question we are going to ask it, namely to label a topic. Note that it uses the `[DOCUMENTS]` and `[KEYWORDS]` tags to provide the most relevant documents and keywords as additional context
After having generated our prompt template, we can start running our topic model:
```python
from bertopic.representation import TextGeneration
from bertopic import BERTopic
# Text generation with Llama
llama2 = TextGeneration(generator, prompt=prompt)
representation_model = {
"Llama2": llama2,
}
# Create our BERTopic model
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
## **llama.cpp**
An amazing framework for using LLMs for inference is [`llama.cpp`](https://github.com/ggerganov/llama.cpp) which has [python bindings](https://github.com/abetlen/llama-cpp-python) that we can use in BERTopic. To start with, we first need to install `llama-cpp-python`:
```bash
pip install llama-cpp-python
```
or using the following for hardware acceleration:
```bash
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
!!! Note
There are a number of [installation options](https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration) depending on your hardware and OS. Make sure that you select the correct one to optimize your performance.
After installation, you need to download your LLM locally before we use it in BERTopic, like so:
```bash
wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf
```
Finally, we can now use the model with BERTopic in just a couple of lines:
```python
from bertopic import BERTopic
from bertopic.representation import LlamaCPP
# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")
# Create our BERTopic model
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
If you want to have more control over the LLM's parameters, you can run it like so:
```python
from bertopic import BERTopic
from bertopic.representation import LlamaCPP
from llama_cpp import Llama
# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
llm = Llama(model_path="zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop="Q:")
representation_model = LlamaCPP(llm)
# Create our BERTopic model
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
!!! Note
The default template that is being used uses a "Q: ... A: ... " type of structure which is why the `stop` is set at `"Q:"`.
The default template is:
```python
"""
Q: I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic?
A:
"""
```
## **OpenAI**
Instead of using a language model from 🤗 transformers, we can use external APIs that
do the work for you. Here, we can use [OpenAI](https://openai.com/api/) to extract our topic labels from the candidate documents and keywords.
To use this, you will need to install openai first:
```bash
pip install openai
```
Then, get yourself an API key and use OpenAI's API as follows:
```python
import openai
from bertopic.representation import OpenAI
from bertopic import BERTopic
# Create your representation model
client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(client)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/openai.svg"
</div>
<br>
You can also use a custom prompt:
```python
prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
representation_model = OpenAI(client, prompt=prompt)
```
### **GPT-4o**
To choose a specific model from OpenAI's offering:
```python
representation_model = OpenAI(client, model="gpt-4o-mini", delay_in_seconds=10)
```
Prompting with their models is very satisfying and is customizable as follows:
```python
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""
```
!!! note
Whenever you create a custom prompt, it is important to add
```
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
```
at the end of your prompt as BERTopic extracts everything that comes after `topic: `. Having
said that, if `topic: ` is not in the output, then it will simply extract the entire response, so
feel free to experiment with the prompts.
### **Summarization**
Due to the structure of the prompts in OpenAI's chat models, we can extract different types of topic representations from their GPT models.
Instead of extracting a topic label, we can ask it to extract a short description of the topic:
```python
summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
In this topic, the following documents are a small but representative subset of all documents in the topic:
[DOCUMENTS]
Based on the information above, please give a description of this topic in the following format:
topic: <description>
"""
representation_model = OpenAI(client, model="gpt-4o-mini", prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)
```
The above is not constrained to just creating a short description or summary of the topic. We can extract labels, keywords, poems, example documents, extensive descriptions, and more using this method!
If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic.
## **Ollama**
To use [Ollama](https://github.com/ollama/ollama) within BERTopic, it is advised to use the `openai` package as it allows you to pass through a model using the URL on which the model is running.
You will first need to install `openai`:
```bash
pip install openai
```
After installation, usage is straightforward and you can select any model that you have prepared in your `ollama` model list. You can see all models by running `ollama list`.
Select one from the list and you can use it in BERTopic as follows:
```python
import openai
from bertopic.representation import OpenAI
from bertopic import BERTopic
client = openai.OpenAI(
base_url = 'http://localhost:11434/v1', #wherever ollama is running
api_key='ollama', # required, but unused
)
# Create your representation model
representation_model = OpenAI(client, model='phi3:14b-medium-128k-instruct-q4_K_M')
# Create your BERTopic model
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
## **LiteLLM**
An amazing framework to simplify connecting to external LLMs is [LiteLLM](https://docs.litellm.ai). This package allows you to connect to OpenAI, Cohere, Anthropic, etc. all within one package. This makes iteration and testing out different models a breeze!
To start with, we first need to install `litellm`:
```bash
pip install litellm
```
After installation, usage is straightforward and you can select any model found in their [docs](https://docs.litellm.ai/docs/providers).
Let's show an example with OpenAI:
```python
import os
from bertopic import BERTopic
from bertopic.representation import LiteLLM
# set ENV variables
os.environ["OPENAI_API_KEY"] = "MY_KEY"
# Create your representation model
representation_model = LiteLLM(model="gpt-4o-mini")
# Create our BERTopic model
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
## **LangChain**
[Langchain](https://github.com/hwchase17/langchain) is a package that helps users with chaining large language models.
In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this
external knowledge consists of the most representative documents in each topic.
To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain,
like openai:
```bash
pip install langchain openai
```
Then, you can create your chain as follows:
```python
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff")
```
Finally, you can pass the chain to BERTopic as follows:
```python
from bertopic.representation import LangChain
# Create your representation model
representation_model = LangChain(chain)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
You can also use a custom prompt:
```python
prompt = "What are these documents about? Please give a single label."
representation_model = LangChain(chain, prompt=prompt)
```
!!! note "Note"
    The prompt does not make use of `[KEYWORDS]` and `[DOCUMENTS]` tags as
    the documents are already used within langchain's `load_qa_chain`.
## **Cohere**
Instead of using a language model from 🤗 transformers, we can use external APIs that
do the work for us. Here, we can use [Cohere](https://docs.cohere.ai/) to extract our topic labels from the candidate documents and keywords.
To use this, you will need to install cohere first:
```bash
pip install cohere
```
Then, get yourself an API key and use Cohere's API as follows:
```python
import cohere
from bertopic.representation import Cohere
from bertopic import BERTopic
# Create your representation model
co = cohere.Client(my_api_key)
representation_model = Cohere(co)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/cohere.svg"
</div>
<br>
You can also use a custom prompt:
```python
prompt = """
I have topic that contains the following documents: [DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS].
Based on the above information, can you give a short label of the topic?
"""
representation_model = Cohere(co, prompt=prompt)
```
@@ -0,0 +1,56 @@
<svg width="523" height="43" viewBox="0 0 523 43" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="523" height="43" fill="white"/>
<path d="M27.3553 12.5217C27.5174 11.9093 27.5534 11.7291 28.8323 11.7291C29.1746 11.7291 29.3367 11.7291 29.3367 11.3869C29.3367 11.2068 29.2106 11.2068 28.8683 11.2068H26.7068C26.2565 11.2068 26.2385 11.2248 26.0404 11.513L19.4658 21.9062L18.1149 11.6031C18.0609 11.2068 18.0429 11.2068 17.5745 11.2068H15.341C14.9988 11.2068 14.8366 11.2068 14.8366 11.549C14.8366 11.7291 14.9988 11.7291 15.2689 11.7291C16.3677 11.7291 16.3677 11.8732 16.3677 12.0714C16.3677 12.1074 16.3677 12.2155 16.2956 12.4857L13.9901 21.672C13.7739 22.5366 13.3596 22.9329 12.1528 22.9869C12.0987 22.9869 11.8826 23.0049 11.8826 23.3112C11.8826 23.5093 12.0447 23.5093 12.1168 23.5093C12.477 23.5093 13.3956 23.4733 13.7559 23.4733H14.6205C14.8727 23.4733 15.1789 23.5093 15.4311 23.5093C15.5571 23.5093 15.7553 23.5093 15.7553 23.1671C15.7553 23.0049 15.5752 22.9869 15.5031 22.9869C14.9087 22.9689 14.3323 22.8608 14.3323 22.2124C14.3323 22.0323 14.3323 22.0143 14.4043 21.7621L16.8901 11.8372H16.9081L18.4031 23.023C18.4571 23.4553 18.4752 23.5093 18.6373 23.5093C18.8354 23.5093 18.9255 23.3652 19.0155 23.2031L26.2565 11.7472H26.2746L23.6627 22.1764C23.5006 22.8068 23.4646 22.9869 22.2037 22.9869C21.8615 22.9869 21.6814 22.9869 21.6814 23.3112C21.6814 23.5093 21.8435 23.5093 21.9516 23.5093C22.2578 23.5093 22.618 23.4733 22.9242 23.4733H25.0497C25.3559 23.4733 25.7342 23.5093 26.0404 23.5093C26.1845 23.5093 26.3826 23.5093 26.3826 23.1671C26.3826 22.9869 26.2205 22.9869 25.9503 22.9869C24.8516 22.9869 24.8516 22.8428 24.8516 22.6627C24.8516 22.6447 24.8516 22.5186 24.8876 22.3745L27.3553 12.5217Z" fill="black"/>
<path d="M46.2995 12.5217C46.4617 11.9093 46.4977 11.7291 47.7766 11.7291C48.1188 11.7291 48.2809 11.7291 48.2809 11.3869C48.2809 11.2068 48.1548 11.2068 47.8126 11.2068H45.6511C45.2008 11.2068 45.1828 11.2248 44.9846 11.513L38.4101 21.9062L37.0592 11.6031C37.0051 11.2068 36.9871 11.2068 36.5188 11.2068H34.2852C33.943 11.2068 33.7809 11.2068 33.7809 11.549C33.7809 11.7291 33.943 11.7291 34.2132 11.7291C35.3119 11.7291 35.3119 11.8732 35.3119 12.0714C35.3119 12.1074 35.3119 12.2155 35.2399 12.4857L32.9343 21.672C32.7182 22.5366 32.3039 22.9329 31.097 22.9869C31.043 22.9869 30.8268 23.0049 30.8268 23.3112C30.8268 23.5093 30.989 23.5093 31.061 23.5093C31.4213 23.5093 32.3399 23.4733 32.7001 23.4733H33.5647C33.8169 23.4733 34.1231 23.5093 34.3753 23.5093C34.5014 23.5093 34.6995 23.5093 34.6995 23.1671C34.6995 23.0049 34.5194 22.9869 34.4473 22.9869C33.8529 22.9689 33.2765 22.8608 33.2765 22.2124C33.2765 22.0323 33.2765 22.0143 33.3486 21.7621L35.8343 11.8372H35.8523L37.3474 23.023C37.4014 23.4553 37.4194 23.5093 37.5815 23.5093C37.7797 23.5093 37.8697 23.3652 37.9598 23.2031L45.2008 11.7472H45.2188L42.607 22.1764C42.4449 22.8068 42.4089 22.9869 41.148 22.9869C40.8057 22.9869 40.6256 22.9869 40.6256 23.3112C40.6256 23.5093 40.7877 23.5093 40.8958 23.5093C41.202 23.5093 41.5623 23.4733 41.8685 23.4733H43.9939C44.3002 23.4733 44.6784 23.5093 44.9846 23.5093C45.1287 23.5093 45.3269 23.5093 45.3269 23.1671C45.3269 22.9869 45.1648 22.9869 44.8946 22.9869C43.7958 22.9869 43.7958 22.8428 43.7958 22.6627C43.7958 22.6447 43.7958 22.5186 43.8318 22.3745L46.2995 12.5217Z" fill="black"/>
<path d="M55.517 12.4316C55.6792 11.7652 55.7512 11.7291 56.4537 11.7291H57.7506C59.2997 11.7291 60.4525 12.1975 60.4525 13.6024C60.4525 14.5211 59.9841 17.1689 56.3636 17.1689H54.3282L55.517 12.4316ZM58.0208 17.3851C60.2543 16.8987 62.0015 15.4577 62.0015 13.9087C62.0015 12.5037 60.5785 11.2068 58.0748 11.2068H53.1934C52.8332 11.2068 52.6711 11.2068 52.6711 11.549C52.6711 11.7291 52.7972 11.7291 53.1394 11.7291C54.2201 11.7291 54.2201 11.8732 54.2201 12.0714C54.2201 12.1074 54.2201 12.2155 54.1481 12.4857L51.7164 22.1764C51.5543 22.8068 51.5183 22.9869 50.2754 22.9869C49.8611 22.9869 49.735 22.9869 49.735 23.3292C49.735 23.5093 49.9332 23.5093 49.9872 23.5093C50.3114 23.5093 50.6897 23.4733 51.0319 23.4733H53.1574C53.4816 23.4733 53.8599 23.5093 54.1841 23.5093C54.3282 23.5093 54.5264 23.5093 54.5264 23.1671C54.5264 22.9869 54.3642 22.9869 54.0941 22.9869C52.9953 22.9869 52.9953 22.8428 52.9953 22.6627C52.9953 22.6447 52.9953 22.5186 53.0313 22.3745L54.2382 17.5292H56.3997C58.1108 17.5292 58.4351 18.6099 58.4351 19.2043C58.4351 19.4745 58.2549 20.177 58.1288 20.6453C57.9307 21.4739 57.8767 21.672 57.8767 22.0143C57.8767 23.2931 58.9214 23.8876 60.1282 23.8876C61.5872 23.8876 62.2177 22.1043 62.2177 21.8521C62.2177 21.7261 62.1276 21.672 62.0195 21.672C61.8754 21.672 61.8394 21.7801 61.8034 21.9242C61.3711 23.2031 60.6326 23.5273 60.1823 23.5273C59.732 23.5273 59.4438 23.3292 59.4438 22.5186C59.4438 22.0863 59.6599 20.4472 59.6779 20.3571C59.768 19.6907 59.768 19.6186 59.768 19.4745C59.768 18.1596 58.7052 17.6012 58.0208 17.3851Z" fill="black"/>
<path d="M79.6231 17.6733C79.8753 17.6733 80.1995 17.6733 80.1995 17.3491C80.1995 17.0068 79.8933 17.0068 79.6231 17.0068H69.0137C68.7616 17.0068 68.4373 17.0068 68.4373 17.3311C68.4373 17.6733 68.7436 17.6733 69.0137 17.6733H79.6231ZM79.6231 21.0236C79.8753 21.0236 80.1995 21.0236 80.1995 20.6994C80.1995 20.3572 79.8933 20.3572 79.6231 20.3572H69.0137C68.7616 20.3572 68.4373 20.3572 68.4373 20.6814C68.4373 21.0236 68.7436 21.0236 69.0137 21.0236H79.6231Z" fill="black"/>
<path d="M91.6075 21.3659C91.5174 21.6721 91.5174 21.7082 91.2652 22.0504C90.869 22.5547 90.0764 23.3293 89.2298 23.3293C88.4913 23.3293 88.077 22.6628 88.077 21.6001C88.077 20.6094 88.6354 18.592 88.9777 17.8355C89.5901 16.5746 90.4367 15.9262 91.1391 15.9262C92.328 15.9262 92.5621 17.4032 92.5621 17.5473C92.5621 17.5653 92.5081 17.7995 92.4901 17.8355L91.6075 21.3659ZM92.7603 16.7547C92.5621 16.2864 92.0758 15.5659 91.1391 15.5659C89.1037 15.5659 86.9062 18.1957 86.9062 20.8616C86.9062 22.6448 87.9509 23.6895 89.1758 23.6895C90.1665 23.6895 91.0131 22.915 91.5174 22.3206C91.6975 23.3833 92.5441 23.6895 93.0845 23.6895C93.6249 23.6895 94.0572 23.3653 94.3814 22.7169C94.6696 22.1044 94.9218 21.0057 94.9218 20.9336C94.9218 20.8436 94.8497 20.7715 94.7416 20.7715C94.5795 20.7715 94.5615 20.8616 94.4895 21.1318C94.2193 22.1945 93.877 23.3293 93.1385 23.3293C92.6162 23.3293 92.5801 22.861 92.5801 22.5007C92.5801 22.0864 92.6342 21.8883 92.7963 21.1858C92.9224 20.7355 93.0124 20.3392 93.1565 19.8169C93.823 17.115 93.9851 16.4665 93.9851 16.3585C93.9851 16.1063 93.787 15.9082 93.5168 15.9082C92.9404 15.9082 92.7963 16.5386 92.7603 16.7547Z" fill="black"/>
<path d="M102.451 16.1423C101.893 16.2504 101.604 16.6467 101.604 17.0429C101.604 17.4752 101.947 17.6193 102.199 17.6193C102.703 17.6193 103.117 17.187 103.117 16.6467C103.117 16.0703 102.559 15.5659 101.658 15.5659C100.938 15.5659 100.109 15.8901 99.3528 16.9889C99.2267 16.0342 98.5062 15.5659 97.7857 15.5659C97.0832 15.5659 96.723 16.1063 96.5068 16.5026C96.2006 17.151 95.9305 18.2318 95.9305 18.3218C95.9305 18.3939 96.0025 18.4839 96.1286 18.4839C96.2727 18.4839 96.2907 18.4659 96.3988 18.0516C96.669 16.9709 97.0112 15.9262 97.7317 15.9262C98.164 15.9262 98.2901 16.2324 98.2901 16.7547C98.2901 17.151 98.11 17.8535 97.9839 18.4119L97.4795 20.3572C97.4075 20.6995 97.2093 21.51 97.1193 21.8343C96.9932 22.3026 96.795 23.1492 96.795 23.2392C96.795 23.4914 96.9932 23.6895 97.2634 23.6895C97.4615 23.6895 97.8037 23.5634 97.9118 23.2032C97.9659 23.0591 98.6323 20.3392 98.7404 19.9249C98.8305 19.5287 98.9385 19.1504 99.0286 18.7541C99.1006 18.5019 99.1727 18.2137 99.2267 17.9796C99.2808 17.8175 99.7671 16.9349 100.217 16.5386C100.434 16.3405 100.902 15.9262 101.64 15.9262C101.929 15.9262 102.217 15.9802 102.451 16.1423Z" fill="black"/>
<path d="M109.97 21.2218C109.898 21.51 109.862 21.5821 109.628 21.8523C108.89 22.8069 108.133 23.1492 107.575 23.1492C106.98 23.1492 106.422 22.6808 106.422 21.438C106.422 20.4833 106.962 18.4659 107.359 17.6554C107.881 16.6467 108.691 15.9262 109.448 15.9262C110.637 15.9262 110.871 17.4032 110.871 17.5113L110.817 17.7634L109.97 21.2218ZM111.087 16.7547C110.853 16.2324 110.349 15.5659 109.448 15.5659C107.485 15.5659 105.251 18.0336 105.251 20.7175C105.251 22.5908 106.386 23.5094 107.539 23.5094C108.493 23.5094 109.34 22.7529 109.664 22.3926L109.268 24.0138C109.016 25.0044 108.908 25.4548 108.259 26.0852C107.521 26.8237 106.836 26.8237 106.44 26.8237C105.9 26.8237 105.449 26.7877 104.999 26.6436C105.575 26.4815 105.719 25.9771 105.719 25.779C105.719 25.4908 105.503 25.2026 105.107 25.2026C104.675 25.2026 104.206 25.5628 104.206 26.1572C104.206 26.8958 104.945 27.184 106.476 27.184C108.8 27.184 110.006 25.6889 110.241 24.7162L112.24 16.6467C112.294 16.4305 112.294 16.3945 112.294 16.3585C112.294 16.1063 112.096 15.9082 111.826 15.9082C111.393 15.9082 111.141 16.2684 111.087 16.7547Z" fill="black"/>
<path d="M123.446 18.2318C123.482 18.1237 123.932 17.2231 124.599 16.6467C125.067 16.2144 125.68 15.9262 126.382 15.9262C127.103 15.9262 127.355 16.4665 127.355 17.187C127.355 17.2951 127.355 17.6554 127.139 18.5019L126.688 20.3572C126.544 20.8976 126.202 22.2305 126.166 22.4287C126.094 22.6988 125.986 23.1672 125.986 23.2392C125.986 23.4914 126.184 23.6895 126.454 23.6895C126.994 23.6895 127.085 23.2752 127.247 22.6268L128.327 18.3218C128.363 18.1777 129.3 15.9262 131.281 15.9262C132.002 15.9262 132.254 16.4665 132.254 17.187C132.254 18.1957 131.552 20.1591 131.155 21.2398C130.993 21.6721 130.903 21.9063 130.903 22.2305C130.903 23.0411 131.462 23.6895 132.326 23.6895C134.001 23.6895 134.632 21.0417 134.632 20.9336C134.632 20.8436 134.56 20.7715 134.452 20.7715C134.29 20.7715 134.272 20.8256 134.181 21.1318C133.767 22.5728 133.101 23.3293 132.38 23.3293C132.2 23.3293 131.912 23.3113 131.912 22.7349C131.912 22.2666 132.128 21.6902 132.2 21.492C132.524 20.6274 133.335 18.5019 133.335 17.4572C133.335 16.3765 132.704 15.5659 131.335 15.5659C130.129 15.5659 129.156 16.2504 128.435 17.3131C128.381 16.3405 127.787 15.5659 126.436 15.5659C124.833 15.5659 123.986 16.7007 123.662 17.151C123.608 16.1243 122.87 15.5659 122.077 15.5659C121.555 15.5659 121.14 15.8181 120.798 16.5026C120.474 17.151 120.222 18.2498 120.222 18.3218C120.222 18.3939 120.294 18.4839 120.42 18.4839C120.564 18.4839 120.582 18.4659 120.69 18.0516C120.96 16.9889 121.303 15.9262 122.023 15.9262C122.437 15.9262 122.581 16.2144 122.581 16.7547C122.581 17.151 122.401 17.8535 122.275 18.4119L121.771 20.3572C121.699 20.6995 121.501 21.51 121.411 21.8343C121.285 22.3026 121.086 23.1492 121.086 23.2392C121.086 23.4914 121.285 23.6895 121.555 23.6895C121.771 23.6895 122.023 23.5815 122.167 23.3113C122.203 23.2212 122.365 22.5908 122.455 22.2305L122.852 20.6094L123.446 18.2318Z" fill="black"/>
<path d="M140.584 21.3659C140.494 21.6721 140.494 21.7082 140.242 22.0504C139.846 22.5547 139.053 23.3293 138.207 23.3293C137.468 23.3293 137.054 22.6628 137.054 21.6001C137.054 20.6094 137.612 18.592 137.955 17.8355C138.567 16.5746 139.414 15.9262 140.116 15.9262C141.305 15.9262 141.539 17.4032 141.539 17.5473C141.539 17.5653 141.485 17.7995 141.467 17.8355L140.584 21.3659ZM141.737 16.7547C141.539 16.2864 141.053 15.5659 140.116 15.5659C138.081 15.5659 135.883 18.1957 135.883 20.8616C135.883 22.6448 136.928 23.6895 138.153 23.6895C139.143 23.6895 139.99 22.915 140.494 22.3206C140.674 23.3833 141.521 23.6895 142.061 23.6895C142.602 23.6895 143.034 23.3653 143.358 22.7169C143.646 22.1044 143.899 21.0057 143.899 20.9336C143.899 20.8436 143.827 20.7715 143.719 20.7715C143.556 20.7715 143.538 20.8616 143.466 21.1318C143.196 22.1945 142.854 23.3293 142.115 23.3293C141.593 23.3293 141.557 22.861 141.557 22.5007C141.557 22.0864 141.611 21.8883 141.773 21.1858C141.899 20.7355 141.989 20.3392 142.133 19.8169C142.8 17.115 142.962 16.4665 142.962 16.3585C142.962 16.1063 142.764 15.9082 142.494 15.9082C141.917 15.9082 141.773 16.5386 141.737 16.7547Z" fill="black"/>
<path d="M152.959 16.1603C152.383 16.2684 152.166 16.7007 152.166 17.0429C152.166 17.4752 152.509 17.6193 152.761 17.6193C153.301 17.6193 153.679 17.151 153.679 16.6647C153.679 15.9082 152.815 15.5659 152.058 15.5659C150.96 15.5659 150.347 16.6467 150.185 16.9889C149.771 15.638 148.654 15.5659 148.33 15.5659C146.492 15.5659 145.52 17.9255 145.52 18.3218C145.52 18.3939 145.592 18.4839 145.718 18.4839C145.862 18.4839 145.898 18.3759 145.934 18.3038C146.546 16.3044 147.753 15.9262 148.276 15.9262C149.086 15.9262 149.248 16.6827 149.248 17.115C149.248 17.5113 149.14 17.9255 148.924 18.7901L148.312 21.2579C148.042 22.3386 147.519 23.3293 146.564 23.3293C146.474 23.3293 146.024 23.3293 145.646 23.0951C146.294 22.969 146.438 22.4287 146.438 22.2125C146.438 21.8523 146.168 21.6361 145.826 21.6361C145.394 21.6361 144.925 22.0144 144.925 22.5908C144.925 23.3473 145.772 23.6895 146.546 23.6895C147.411 23.6895 148.023 23.0051 148.402 22.2666C148.69 23.3293 149.591 23.6895 150.257 23.6895C152.094 23.6895 153.067 21.3299 153.067 20.9336C153.067 20.8436 152.995 20.7715 152.887 20.7715C152.725 20.7715 152.707 20.8616 152.653 21.0057C152.166 22.5908 151.122 23.3293 150.311 23.3293C149.681 23.3293 149.338 22.861 149.338 22.1225C149.338 21.7262 149.41 21.438 149.699 20.2492L150.329 17.7995C150.599 16.7187 151.212 15.9262 152.04 15.9262C152.076 15.9262 152.581 15.9262 152.959 16.1603Z" fill="black"/>
<path d="M114.979 34.5689C114.871 35.0252 114.847 35.1213 113.958 35.1213C113.73 35.1213 113.586 35.1213 113.586 35.3495C113.586 35.5176 113.742 35.5176 113.946 35.5176H118.077C120.611 35.5176 123.133 33.0919 123.133 30.4141C123.133 28.5768 121.908 27.3159 120.119 27.3159H115.916C115.7 27.3159 115.544 27.3159 115.544 27.5441C115.544 27.7122 115.688 27.7122 115.892 27.7122C116.288 27.7122 116.636 27.7122 116.636 27.9043C116.636 27.9524 116.624 27.9644 116.588 28.1205L114.979 34.5689ZM117.633 28.1565C117.741 27.7482 117.753 27.7122 118.245 27.7122H119.698C121.103 27.7122 122.052 28.5167 122.052 29.9818C122.052 30.378 121.884 32.3354 120.839 33.6803C120.311 34.3528 119.278 35.1213 117.861 35.1213H116.072C115.964 35.0973 115.928 35.0973 115.928 35.0132C115.928 34.9172 115.952 34.8211 115.976 34.7491L117.633 28.1565Z" fill="black"/>
<path d="M126.575 31.7284C126.575 31.5123 126.413 31.3682 126.206 31.3682C125.963 31.3682 125.702 31.6023 125.702 31.8635C125.702 32.0887 125.864 32.2238 126.062 32.2238C126.35 32.2238 126.575 31.9536 126.575 31.7284ZM126.035 34.8716C126.071 34.7815 126.152 34.5654 126.188 34.4843C126.215 34.4033 126.251 34.3132 126.251 34.1601C126.251 33.6557 125.801 33.3766 125.35 33.3766C124.432 33.3766 123.991 34.5654 123.991 34.7635C123.991 34.8085 124.027 34.8896 124.153 34.8896C124.279 34.8896 124.306 34.8356 124.333 34.7545C124.585 33.8809 125.062 33.6738 125.314 33.6738C125.495 33.6738 125.558 33.7908 125.558 33.998C125.558 34.1781 125.504 34.3132 125.486 34.3762L125.017 35.529C124.909 35.7902 124.909 35.8082 124.792 36.1144C124.675 36.3846 124.63 36.5017 124.63 36.6548C124.63 37.1141 125.044 37.4384 125.54 37.4384C126.449 37.4384 126.9 36.2495 126.9 36.0514C126.9 36.0334 126.891 35.9253 126.728 35.9253C126.602 35.9253 126.593 35.9703 126.539 36.1415C126.395 36.6008 126.026 37.1412 125.567 37.1412C125.405 37.1412 125.323 37.0331 125.323 36.8169C125.323 36.6368 125.377 36.5017 125.495 36.2225L126.035 34.8716Z" fill="black"/>
<path d="M135.182 32.7918C135.374 32.7918 135.638 32.7918 135.638 32.5156C135.638 32.2394 135.374 32.2394 135.182 32.2394H129.826C129.982 30.5703 131.375 29.2974 133.2 29.2974H135.182C135.374 29.2974 135.638 29.2974 135.638 29.0212C135.638 28.745 135.374 28.745 135.182 28.745H133.164C131.003 28.745 129.262 30.4262 129.262 32.5156C129.262 34.6171 131.015 36.2862 133.164 36.2862H135.182C135.374 36.2862 135.638 36.2862 135.638 36.01C135.638 35.7338 135.374 35.7338 135.182 35.7338H133.2C131.375 35.7338 129.982 34.4609 129.826 32.7918H135.182Z" fill="black"/>
<path d="M141.37 28.1565C141.478 27.7482 141.49 27.7122 141.983 27.7122H142.955C143.94 27.7122 144.865 27.9644 144.865 28.937C144.865 29.4654 144.588 30.306 144.036 30.7023C143.46 31.1105 142.775 31.2426 142.079 31.2426H140.602L141.37 28.1565ZM143.412 31.4348C144.889 31.0625 146.041 30.1619 146.041 29.1412C146.041 28.0965 144.865 27.3159 143.195 27.3159H139.653C139.437 27.3159 139.281 27.3159 139.281 27.5441C139.281 27.7122 139.437 27.7122 139.629 27.7122C140.025 27.7122 140.373 27.7122 140.373 27.9043C140.373 27.9524 140.361 27.9644 140.325 28.1205L138.716 34.5689C138.608 35.0252 138.584 35.1213 137.696 35.1213C137.467 35.1213 137.323 35.1213 137.323 35.3495C137.323 35.3975 137.359 35.5176 137.515 35.5176C137.744 35.5176 138.02 35.4936 138.26 35.4816H139.004C140.121 35.4816 140.469 35.5176 140.542 35.5176C140.614 35.5176 140.782 35.5176 140.782 35.2894C140.782 35.1213 140.626 35.1213 140.421 35.1213C140.385 35.1213 140.169 35.1213 139.965 35.0973C139.713 35.0733 139.689 35.0373 139.689 34.9292C139.689 34.8691 139.713 34.7971 139.725 34.737L140.506 31.5789H142.067C143.147 31.5789 143.484 32.1553 143.484 32.6476C143.484 32.8157 143.4 33.152 143.339 33.4041C143.243 33.7524 143.123 34.2327 143.123 34.4488C143.123 35.3735 143.916 35.7698 144.804 35.7698C145.849 35.7698 146.342 34.5929 146.342 34.3648C146.342 34.3168 146.306 34.1967 146.149 34.1967C146.017 34.1967 145.981 34.3048 145.969 34.3528C145.705 35.1814 145.189 35.4335 144.84 35.4335C144.396 35.4335 144.348 35.0973 144.348 34.701C144.348 34.3047 144.42 33.7764 144.468 33.3801C144.516 33.0199 144.516 32.9478 144.516 32.8277C144.516 32.1072 144.048 31.6869 143.412 31.4348Z" fill="black"/>
<path d="M151.462 38.2196C151.546 38.4357 151.63 38.5198 151.775 38.5198C151.931 38.5198 152.051 38.3997 152.051 38.2436C152.051 38.2076 152.051 38.1835 151.979 38.0154L147.692 26.8117C147.608 26.5955 147.524 26.5115 147.379 26.5115C147.223 26.5115 147.103 26.6316 147.103 26.7877C147.103 26.8237 147.103 26.8477 147.175 27.0158L151.462 38.2196Z" fill="black"/>
<path d="M160.832 27.3881C160.844 27.34 160.868 27.268 160.868 27.2079C160.868 27.1239 160.796 27.0638 160.712 27.0638C160.628 27.0638 160.604 27.0879 160.472 27.232C160.339 27.3881 160.027 27.7723 159.895 27.9164C159.427 27.232 158.67 27.0638 158.058 27.0638C156.389 27.0638 154.96 28.4688 154.96 29.8378C154.96 30.5342 155.332 30.9425 155.392 31.0266C155.788 31.4469 156.137 31.5429 157.001 31.7471C157.421 31.8552 157.445 31.8552 157.794 31.9392C158.142 32.0233 158.911 32.2154 158.911 33.2121C158.911 34.2568 157.878 35.3736 156.617 35.3736C155.836 35.3736 154.407 35.1334 154.407 33.6444C154.407 33.6084 154.407 33.3562 154.479 33.068L154.491 32.9479C154.491 32.8038 154.359 32.7918 154.311 32.7918C154.155 32.7918 154.143 32.8398 154.083 33.116L153.591 35.0734C153.543 35.2535 153.459 35.5777 153.459 35.6138C153.459 35.7098 153.531 35.7699 153.615 35.7699C153.699 35.7699 153.711 35.7579 153.843 35.6017L154.419 34.9293C154.696 35.2895 155.38 35.7699 156.593 35.7699C158.334 35.7699 159.763 34.1968 159.763 32.7558C159.763 32.2034 159.583 31.7711 159.259 31.4349C158.899 31.0386 158.502 30.9425 157.938 30.7984C157.59 30.7144 157.121 30.6063 156.845 30.5342C156.485 30.4502 155.812 30.21 155.812 29.3694C155.812 28.4328 156.821 27.4241 158.046 27.4241C159.127 27.4241 159.871 27.9885 159.871 29.2854C159.871 29.5736 159.823 29.8258 159.823 29.8738C159.823 30.0179 159.931 30.0419 160.015 30.0419C160.159 30.0419 160.171 29.9939 160.219 29.8017L160.832 27.3881Z" fill="black"/>
<path d="M165.708 28.0125V27.346H163.961V10.6665H165.708V10H163.295V28.0125H165.708Z" fill="black"/>
<path d="M171.67 12.2876C171.22 11.0087 169.797 11.0087 169.563 11.0087C169.455 11.0087 169.257 11.0087 169.257 11.1888C169.257 11.3329 169.365 11.3509 169.455 11.3689C169.725 11.4049 169.941 11.441 170.229 11.9633C170.409 12.3056 172.265 17.6913 172.265 17.7273C172.265 17.7453 172.247 17.7633 172.103 17.9074L167.419 22.6447C167.203 22.8609 167.059 23.005 167.059 23.2391C167.059 23.4913 167.275 23.7074 167.563 23.7074C167.635 23.7074 167.834 23.6714 167.942 23.5633C168.23 23.2932 170.806 20.141 172.445 18.1956C172.913 19.6006 173.489 21.2578 174.048 22.7708C174.138 23.041 174.228 23.2932 174.48 23.5273C174.66 23.6894 174.696 23.6894 175.201 23.6894H175.543C175.615 23.6894 175.741 23.6894 175.741 23.5453C175.741 23.4733 175.723 23.4553 175.651 23.3832C175.489 23.1851 175.363 22.8609 175.291 22.6447L171.67 12.2876Z" fill="black"/>
<path d="M187.83 10.9907C187.83 10.8286 187.704 10.8286 187.668 10.8286C187.596 10.8286 187.578 10.8466 187.362 11.1168C187.254 11.2429 186.516 12.1795 186.498 12.1976C185.903 11.0267 184.714 10.8286 183.958 10.8286C181.67 10.8286 179.599 12.9181 179.599 14.9535C179.599 16.3044 180.409 17.0969 181.292 17.4032C181.49 17.4752 182.553 17.7634 183.093 17.8895C184.012 18.1417 184.246 18.2137 184.624 18.61C184.696 18.7001 185.057 19.1143 185.057 19.9609C185.057 21.6361 183.508 23.3653 181.706 23.3653C180.229 23.3653 178.59 22.7348 178.59 20.7174C178.59 20.3752 178.662 19.9429 178.716 19.7628C178.716 19.7088 178.734 19.6187 178.734 19.5827C178.734 19.5106 178.698 19.4206 178.554 19.4206C178.392 19.4206 178.374 19.4566 178.302 19.7628L177.383 23.4553C177.383 23.4734 177.311 23.7075 177.311 23.7255C177.311 23.8876 177.455 23.8876 177.491 23.8876C177.563 23.8876 177.581 23.8696 177.798 23.5994L178.626 22.5187C179.058 23.1671 179.995 23.8876 181.67 23.8876C183.994 23.8876 186.119 21.6361 186.119 19.3845C186.119 18.628 185.939 17.9615 185.255 17.2951C184.876 16.9168 184.552 16.8268 182.895 16.3945C181.688 16.0702 181.526 16.0162 181.202 15.728C180.896 15.4218 180.662 14.9895 180.662 14.3771C180.662 12.864 182.193 11.3149 183.904 11.3149C185.669 11.3149 186.498 12.3957 186.498 14.1069C186.498 14.5752 186.408 15.0615 186.408 15.1336C186.408 15.2957 186.552 15.2957 186.606 15.2957C186.768 15.2957 186.786 15.2417 186.858 14.9535L187.83 10.9907Z" fill="black"/>
<path d="M193.386 20.9336C193.386 20.8435 193.314 20.7715 193.206 20.7715C193.044 20.7715 193.026 20.8255 192.935 21.1317C192.467 22.7708 191.729 23.3292 191.134 23.3292C190.918 23.3292 190.666 23.2752 190.666 22.7348C190.666 22.2485 190.882 21.7081 191.08 21.1677L192.341 17.8174C192.395 17.6733 192.521 17.3491 192.521 17.0069C192.521 16.2503 191.981 15.5659 191.098 15.5659C189.441 15.5659 188.775 18.1777 188.775 18.3218C188.775 18.3938 188.847 18.4839 188.973 18.4839C189.135 18.4839 189.153 18.4118 189.225 18.1597C189.657 16.6466 190.342 15.9261 191.044 15.9261C191.206 15.9261 191.512 15.9441 191.512 16.5205C191.512 16.9888 191.278 17.5833 191.134 17.9795L189.873 21.3298C189.765 21.618 189.657 21.9062 189.657 22.2305C189.657 23.041 190.216 23.6895 191.08 23.6895C192.737 23.6895 193.386 21.0597 193.386 20.9336ZM193.242 12.2696C193.242 11.9994 193.026 11.6752 192.629 11.6752C192.215 11.6752 191.747 12.0714 191.747 12.5398C191.747 12.9901 192.125 13.1342 192.341 13.1342C192.827 13.1342 193.242 12.6659 193.242 12.2696Z" fill="black"/>
<path d="M198.016 18.2318C198.052 18.1237 198.502 17.2231 199.168 16.6467C199.637 16.2144 200.249 15.9262 200.952 15.9262C201.672 15.9262 201.924 16.4665 201.924 17.187C201.924 17.2951 201.924 17.6554 201.708 18.5019L201.258 20.3572C201.114 20.8976 200.771 22.2305 200.735 22.4287C200.663 22.6988 200.555 23.1672 200.555 23.2392C200.555 23.4914 200.753 23.6895 201.024 23.6895C201.564 23.6895 201.654 23.2752 201.816 22.6268L202.897 18.3218C202.933 18.1777 203.87 15.9262 205.851 15.9262C206.571 15.9262 206.824 16.4665 206.824 17.187C206.824 18.1957 206.121 20.1591 205.725 21.2398C205.563 21.6721 205.473 21.9063 205.473 22.2305C205.473 23.0411 206.031 23.6895 206.896 23.6895C208.571 23.6895 209.201 21.0417 209.201 20.9336C209.201 20.8436 209.129 20.7715 209.021 20.7715C208.859 20.7715 208.841 20.8256 208.751 21.1318C208.337 22.5728 207.67 23.3293 206.95 23.3293C206.77 23.3293 206.481 23.3113 206.481 22.7349C206.481 22.2666 206.698 21.6902 206.77 21.492C207.094 20.6274 207.904 18.5019 207.904 17.4572C207.904 16.3765 207.274 15.5659 205.905 15.5659C204.698 15.5659 203.725 16.2504 203.005 17.3131C202.951 16.3405 202.357 15.5659 201.006 15.5659C199.403 15.5659 198.556 16.7007 198.232 17.151C198.178 16.1243 197.439 15.5659 196.647 15.5659C196.124 15.5659 195.71 15.8181 195.368 16.5026C195.043 17.151 194.791 18.2498 194.791 18.3218C194.791 18.3939 194.863 18.4839 194.989 18.4839C195.134 18.4839 195.152 18.4659 195.26 18.0516C195.53 16.9889 195.872 15.9262 196.593 15.9262C197.007 15.9262 197.151 16.2144 197.151 16.7547C197.151 17.151 196.971 17.8535 196.845 18.4119L196.34 20.3572C196.268 20.6995 196.07 21.51 195.98 21.8343C195.854 22.3026 195.656 23.1492 195.656 23.2392C195.656 23.4914 195.854 23.6895 196.124 23.6895C196.34 23.6895 196.593 23.5815 196.737 23.3113C196.773 23.2212 196.935 22.5908 197.025 22.2305L197.421 20.6094L198.016 18.2318Z" fill="black"/>
<path d="M213.503 18.5619C213.503 18.2377 213.479 18.2257 213.155 18.2257C212.662 18.706 212.026 18.9942 210.885 18.9942V19.3905C211.209 19.3905 211.858 19.3905 212.554 19.0663V25.2265C212.554 25.6709 212.518 25.815 211.377 25.815H210.957V26.2112C211.449 26.1752 212.482 26.1752 213.022 26.1752C213.563 26.1752 214.608 26.1752 215.1 26.2112V25.815H214.68C213.539 25.815 213.503 25.6709 213.503 25.2265V18.5619Z" fill="black"/>
<path d="M222.716 27.8864C222.716 27.8323 222.716 27.7963 222.41 27.4901C220.609 25.6708 219.6 22.6988 219.6 19.0242C219.6 15.5298 220.447 12.5217 222.536 10.3963C222.716 10.2342 222.716 10.1981 222.716 10.1441C222.716 10.036 222.626 10 222.554 10C222.32 10 220.843 11.2969 219.96 13.0621C219.042 14.8814 218.627 16.8087 218.627 19.0242C218.627 20.6273 218.88 22.7708 219.816 24.6982C220.879 26.8597 222.356 28.0305 222.554 28.0305C222.626 28.0305 222.716 27.9944 222.716 27.8864Z" fill="black"/>
<path d="M226.549 22.1764C226.387 22.8068 226.351 22.9869 225.09 22.9869C224.748 22.9869 224.568 22.9869 224.568 23.3112C224.568 23.5093 224.676 23.5093 225.036 23.5093H230.746C234.384 23.5093 237.933 19.7447 237.933 15.7279C237.933 13.1341 236.384 11.2068 233.808 11.2068H228.026C227.684 11.2068 227.522 11.2068 227.522 11.549C227.522 11.7291 227.684 11.7291 227.954 11.7291C229.053 11.7291 229.053 11.8732 229.053 12.0714C229.053 12.1074 229.053 12.2155 228.981 12.4857L226.549 22.1764ZM230.35 12.4316C230.512 11.7652 230.584 11.7291 231.286 11.7291H233.268C234.961 11.7291 236.51 12.6478 236.51 15.1335C236.51 16.0341 236.15 19.1683 234.402 21.1497C233.898 21.7441 232.529 22.9869 230.458 22.9869H228.404C228.152 22.9869 228.116 22.9869 228.008 22.9689C227.81 22.9509 227.792 22.9149 227.792 22.7708C227.792 22.6447 227.828 22.5366 227.864 22.3745L230.35 12.4316Z" fill="black"/>
<path d="M241.943 18.718C241.943 18.4539 241.751 18.2617 241.475 18.2617C241.163 18.2617 240.814 18.5499 240.814 18.9102C240.814 19.1743 241.007 19.3665 241.283 19.3665C241.595 19.3665 241.943 19.0783 241.943 18.718ZM240.19 23.1251L239.542 24.7822C239.482 24.9624 239.421 25.1065 239.421 25.3106C239.421 25.899 239.878 26.3313 240.514 26.3313C241.679 26.3313 242.171 24.6501 242.171 24.494C242.171 24.3739 242.075 24.3379 241.991 24.3379C241.847 24.3379 241.823 24.422 241.787 24.5421C241.511 25.5027 241.019 25.9951 240.538 25.9951C240.394 25.9951 240.25 25.935 240.25 25.6108C240.25 25.3226 240.334 25.1065 240.49 24.7342C240.61 24.41 240.73 24.0857 240.862 23.7615L241.235 22.7888C241.343 22.5127 241.487 22.1404 241.487 21.9363C241.487 21.3358 241.007 20.9156 240.394 20.9156C239.229 20.9156 238.725 22.5967 238.725 22.7528C238.725 22.8609 238.809 22.9089 238.905 22.9089C239.061 22.9089 239.073 22.8369 239.109 22.7168C239.446 21.576 239.998 21.2518 240.358 21.2518C240.526 21.2518 240.646 21.3118 240.646 21.6481C240.646 21.7681 240.634 21.9363 240.514 22.2965L240.19 23.1251Z" fill="black"/>
<path d="M246.972 23.5813C246.972 22.5366 246.629 21.7621 245.891 21.7621C245.315 21.7621 245.026 22.2304 245.026 22.6267C245.026 23.023 245.296 23.5093 245.909 23.5093C246.143 23.5093 246.341 23.4372 246.503 23.2751C246.539 23.2391 246.557 23.2391 246.575 23.2391C246.611 23.2391 246.611 23.4913 246.611 23.5813C246.611 24.1758 246.503 25.3466 245.459 26.5174C245.26 26.7335 245.26 26.7695 245.26 26.8056C245.26 26.8956 245.351 26.9857 245.441 26.9857C245.585 26.9857 246.972 25.6528 246.972 23.5813Z" fill="black"/>
<path d="M259.034 23.4013C262.132 22.1584 264.437 18.8802 264.437 15.6199C264.437 12.6479 262.492 10.8286 259.97 10.8286C256.116 10.8286 252.225 14.9715 252.225 19.1324C252.225 21.9603 254.098 23.8876 256.71 23.8876C257.323 23.8876 257.917 23.7976 258.493 23.6175C258.385 24.6982 258.385 24.7883 258.385 25.1305C258.385 25.6529 258.385 27.0038 259.826 27.0038C261.97 27.0038 262.762 23.6355 262.762 23.5274C262.762 23.4193 262.69 23.3473 262.6 23.3473C262.492 23.3473 262.456 23.4373 262.402 23.6355C262.006 24.7703 261.069 25.4187 260.277 25.4187C259.358 25.4187 259.124 24.7883 259.034 23.4013ZM255.791 23.3112C254.368 22.8249 253.684 21.3299 253.684 19.6727C253.684 18.4119 254.152 16.0162 255.341 14.1969C256.674 12.1435 258.457 11.2249 259.862 11.2249C261.736 11.2249 262.996 12.7379 262.996 15.0796C262.996 16.4125 262.366 20.8796 258.98 22.8609C258.89 21.9423 258.637 20.8615 257.449 20.8615C256.476 20.8615 255.647 21.8342 255.647 22.7168C255.647 22.915 255.719 23.1852 255.791 23.3112ZM258.529 23.0951C257.881 23.3833 257.323 23.4914 256.818 23.4914C256.638 23.4914 256.008 23.4914 256.008 22.6988C256.008 22.0504 256.638 21.2218 257.449 21.2218C258.349 21.2218 258.547 21.8342 258.547 22.7348C258.547 22.8429 258.547 22.987 258.529 23.0951Z" fill="black"/>
<path d="M270.409 19.0242C270.409 17.6553 270.228 15.4217 269.22 13.3323C268.157 11.1708 266.68 10 266.482 10C266.41 10 266.32 10.036 266.32 10.1441C266.32 10.1981 266.32 10.2342 266.626 10.5404C268.427 12.3596 269.436 15.3317 269.436 19.0062C269.436 22.5006 268.589 25.5087 266.5 27.6342C266.32 27.7963 266.32 27.8323 266.32 27.8864C266.32 27.9944 266.41 28.0305 266.482 28.0305C266.716 28.0305 268.193 26.7336 269.076 24.9684C269.994 23.1311 270.409 21.1857 270.409 19.0242Z" fill="black"/>
<path d="M288.061 19.3665C288.367 19.3665 288.691 19.3665 288.691 19.0062C288.691 18.646 288.367 18.646 288.061 18.646H278.316C278.01 18.646 277.686 18.646 277.686 19.0062C277.686 19.3665 278.01 19.3665 278.316 19.3665H288.061Z" fill="black"/>
<path d="M300.057 27.8864C300.057 27.8323 300.057 27.7963 299.751 27.4901C297.95 25.6708 296.941 22.6988 296.941 19.0242C296.941 15.5298 297.788 12.5217 299.877 10.3963C300.057 10.2342 300.057 10.1981 300.057 10.1441C300.057 10.036 299.967 10 299.895 10C299.661 10 298.184 11.2969 297.301 13.0621C296.383 14.8814 295.968 16.8087 295.968 19.0242C295.968 20.6273 296.22 22.7708 297.157 24.6982C298.22 26.8597 299.697 28.0305 299.895 28.0305C299.967 28.0305 300.057 27.9944 300.057 27.8864Z" fill="black"/>
<path d="M306.25 11.9633C306.25 11.549 306.25 11.531 305.889 11.531C305.457 12.0173 304.556 12.6838 302.701 12.6838V13.2062C303.115 13.2062 304.016 13.2062 305.007 12.7378V22.1223C305.007 22.7708 304.953 22.9869 303.368 22.9869H302.809V23.5093C303.295 23.4733 305.043 23.4733 305.637 23.4733C306.232 23.4733 307.961 23.4733 308.447 23.5093V22.9869H307.889C306.304 22.9869 306.25 22.7708 306.25 22.1223V11.9633Z" fill="black"/>
<path d="M325.753 19.3665C326.06 19.3665 326.384 19.3665 326.384 19.0062C326.384 18.646 326.06 18.646 325.753 18.646H316.009C315.702 18.646 315.378 18.646 315.378 19.0062C315.378 19.3665 315.702 19.3665 316.009 19.3665H325.753Z" fill="black"/>
<path d="M337.461 12.2876C337.011 11.0087 335.588 11.0087 335.354 11.0087C335.246 11.0087 335.048 11.0087 335.048 11.1888C335.048 11.3329 335.156 11.3509 335.246 11.3689C335.516 11.4049 335.732 11.441 336.02 11.9633C336.201 12.3056 338.056 17.6913 338.056 17.7273C338.056 17.7453 338.038 17.7633 337.894 17.9074L333.21 22.6447C332.994 22.8609 332.85 23.005 332.85 23.2391C332.85 23.4913 333.066 23.7074 333.355 23.7074C333.427 23.7074 333.625 23.6714 333.733 23.5633C334.021 23.2932 336.597 20.141 338.236 18.1956C338.704 19.6006 339.281 21.2578 339.839 22.7708C339.929 23.041 340.019 23.2932 340.271 23.5273C340.451 23.6894 340.487 23.6894 340.992 23.6894H341.334C341.406 23.6894 341.532 23.6894 341.532 23.5453C341.532 23.4733 341.514 23.4553 341.442 23.3832C341.28 23.1851 341.154 22.8609 341.082 22.6447L337.461 12.2876Z" fill="black"/>
<path d="M347.263 19.0242C347.263 17.6553 347.083 15.4217 346.075 13.3323C345.012 11.1708 343.535 10 343.337 10C343.265 10 343.175 10.036 343.175 10.1441C343.175 10.1981 343.175 10.2342 343.481 10.5404C345.282 12.3596 346.291 15.3317 346.291 19.0062C346.291 22.5006 345.444 25.5087 343.355 27.6342C343.175 27.7963 343.175 27.8323 343.175 27.8864C343.175 27.9944 343.265 28.0305 343.337 28.0305C343.571 28.0305 345.048 26.7336 345.93 24.9684C346.849 23.1311 347.263 21.1857 347.263 19.0242Z" fill="black"/>
<path d="M358.757 18.2318C358.793 18.1237 359.244 17.2231 359.91 16.6467C360.378 16.2144 360.991 15.9262 361.693 15.9262C362.414 15.9262 362.666 16.4665 362.666 17.187C362.666 17.2951 362.666 17.6554 362.45 18.5019L362 20.3572C361.855 20.8976 361.513 22.2305 361.477 22.4287C361.405 22.6988 361.297 23.1672 361.297 23.2392C361.297 23.4914 361.495 23.6895 361.765 23.6895C362.306 23.6895 362.396 23.2752 362.558 22.6268L363.639 18.3218C363.675 18.1777 364.611 15.9262 366.593 15.9262C367.313 15.9262 367.565 16.4665 367.565 17.187C367.565 18.1957 366.863 20.1591 366.467 21.2398C366.304 21.6721 366.214 21.9063 366.214 22.2305C366.214 23.0411 366.773 23.6895 367.637 23.6895C369.313 23.6895 369.943 21.0417 369.943 20.9336C369.943 20.8436 369.871 20.7715 369.763 20.7715C369.601 20.7715 369.583 20.8256 369.493 21.1318C369.078 22.5728 368.412 23.3293 367.691 23.3293C367.511 23.3293 367.223 23.3113 367.223 22.7349C367.223 22.2666 367.439 21.6902 367.511 21.492C367.836 20.6274 368.646 18.5019 368.646 17.4572C368.646 16.3765 368.016 15.5659 366.647 15.5659C365.44 15.5659 364.467 16.2504 363.747 17.3131C363.693 16.3405 363.098 15.5659 361.747 15.5659C360.144 15.5659 359.298 16.7007 358.973 17.151C358.919 16.1243 358.181 15.5659 357.388 15.5659C356.866 15.5659 356.452 15.8181 356.109 16.5026C355.785 17.151 355.533 18.2498 355.533 18.3218C355.533 18.3939 355.605 18.4839 355.731 18.4839C355.875 18.4839 355.893 18.4659 356.001 18.0516C356.272 16.9889 356.614 15.9262 357.334 15.9262C357.749 15.9262 357.893 16.2144 357.893 16.7547C357.893 17.151 357.713 17.8535 357.586 18.4119L357.082 20.3572C357.01 20.6995 356.812 21.51 356.722 21.8343C356.596 22.3026 356.398 23.1492 356.398 23.2392C356.398 23.4914 356.596 23.6895 356.866 23.6895C357.082 23.6895 357.334 23.5815 357.478 23.3113C357.514 23.2212 357.677 22.5908 357.767 22.2305L358.163 20.6094L358.757 18.2318Z" fill="black"/>
<path d="M375.896 21.3659C375.806 21.6721 375.806 21.7082 375.553 22.0504C375.157 22.5547 374.365 23.3293 373.518 23.3293C372.779 23.3293 372.365 22.6628 372.365 21.6001C372.365 20.6094 372.924 18.592 373.266 17.8355C373.878 16.5746 374.725 15.9262 375.427 15.9262C376.616 15.9262 376.85 17.4032 376.85 17.5473C376.85 17.5653 376.796 17.7995 376.778 17.8355L375.896 21.3659ZM377.048 16.7547C376.85 16.2864 376.364 15.5659 375.427 15.5659C373.392 15.5659 371.194 18.1957 371.194 20.8616C371.194 22.6448 372.239 23.6895 373.464 23.6895C374.455 23.6895 375.301 22.915 375.806 22.3206C375.986 23.3833 376.832 23.6895 377.373 23.6895C377.913 23.6895 378.345 23.3653 378.67 22.7169C378.958 22.1044 379.21 21.0057 379.21 20.9336C379.21 20.8436 379.138 20.7715 379.03 20.7715C378.868 20.7715 378.85 20.8616 378.778 21.1318C378.507 22.1945 378.165 23.3293 377.427 23.3293C376.904 23.3293 376.868 22.861 376.868 22.5007C376.868 22.0864 376.922 21.8883 377.084 21.1858C377.211 20.7355 377.301 20.3392 377.445 19.8169C378.111 17.115 378.273 16.4665 378.273 16.3585C378.273 16.1063 378.075 15.9082 377.805 15.9082C377.229 15.9082 377.084 16.5386 377.048 16.7547Z" fill="black"/>
<path d="M388.27 16.1603C387.694 16.2684 387.478 16.7007 387.478 17.0429C387.478 17.4752 387.82 17.6193 388.072 17.6193C388.612 17.6193 388.991 17.151 388.991 16.6647C388.991 15.9082 388.126 15.5659 387.37 15.5659C386.271 15.5659 385.658 16.6467 385.496 16.9889C385.082 15.638 383.965 15.5659 383.641 15.5659C381.804 15.5659 380.831 17.9255 380.831 18.3218C380.831 18.3939 380.903 18.4839 381.029 18.4839C381.173 18.4839 381.209 18.3759 381.245 18.3038C381.858 16.3044 383.065 15.9262 383.587 15.9262C384.397 15.9262 384.56 16.6827 384.56 17.115C384.56 17.5113 384.451 17.9255 384.235 18.7901L383.623 21.2579C383.353 22.3386 382.83 23.3293 381.876 23.3293C381.786 23.3293 381.335 23.3293 380.957 23.0951C381.606 22.969 381.75 22.4287 381.75 22.2125C381.75 21.8523 381.479 21.6361 381.137 21.6361C380.705 21.6361 380.237 22.0144 380.237 22.5908C380.237 23.3473 381.083 23.6895 381.858 23.6895C382.722 23.6895 383.335 23.0051 383.713 22.2666C384.001 23.3293 384.902 23.6895 385.568 23.6895C387.406 23.6895 388.378 21.3299 388.378 20.9336C388.378 20.8436 388.306 20.7715 388.198 20.7715C388.036 20.7715 388.018 20.8616 387.964 21.0057C387.478 22.5908 386.433 23.3293 385.622 23.3293C384.992 23.3293 384.65 22.861 384.65 22.1225C384.65 21.7262 384.722 21.438 385.01 20.2492L385.64 17.7995C385.91 16.7187 386.523 15.9262 387.351 15.9262C387.388 15.9262 387.892 15.9262 388.27 16.1603Z" fill="black"/>
<path d="M357.764 33.7684C357.656 34.2247 357.632 34.3208 356.743 34.3208C356.515 34.3208 356.371 34.3208 356.371 34.5489C356.371 34.7171 356.527 34.7171 356.731 34.7171H360.862C363.396 34.7171 365.918 32.2914 365.918 29.6135C365.918 27.7763 364.693 26.5154 362.903 26.5154H358.701C358.484 26.5154 358.328 26.5154 358.328 26.7435C358.328 26.9117 358.472 26.9117 358.677 26.9117C359.073 26.9117 359.421 26.9117 359.421 27.1038C359.421 27.1518 359.409 27.1638 359.373 27.3199L357.764 33.7684ZM360.418 27.356C360.526 26.9477 360.538 26.9117 361.03 26.9117H362.483C363.888 26.9117 364.837 27.7162 364.837 29.1812C364.837 29.5775 364.669 31.5349 363.624 32.8798C363.096 33.5522 362.063 34.3208 360.646 34.3208H358.857C358.749 34.2968 358.713 34.2968 358.713 34.2127C358.713 34.1166 358.737 34.0206 358.761 33.9485L360.418 27.356Z" fill="black"/>
<path d="M370.405 30.9279C370.405 30.6577 370.189 30.5676 370.045 30.5676C369.756 30.5676 369.531 30.8468 369.531 31.063C369.531 31.2521 369.675 31.4232 369.9 31.4232C370.135 31.4232 370.405 31.1981 370.405 30.9279ZM368.486 36.953C368.315 37.6285 367.784 38.0878 367.262 38.0878C367.235 38.0878 366.982 38.0788 366.982 38.0428C366.982 38.0338 367.009 38.0158 367.009 38.0068C367.145 37.8987 367.208 37.7546 367.208 37.6195C367.208 37.3313 366.973 37.2683 366.847 37.2683C366.604 37.2683 366.334 37.4664 366.334 37.7996C366.334 38.277 366.901 38.385 367.271 38.385C367.874 38.385 368.955 38.0428 369.225 36.962L370.036 33.7378C370.054 33.6478 370.081 33.5667 370.081 33.4406C370.081 32.9363 369.63 32.576 369.054 32.576C368.036 32.576 367.406 33.7919 367.406 33.963C367.406 34.017 367.442 34.0891 367.568 34.0891C367.694 34.0891 367.703 34.053 367.766 33.9269C368.027 33.3415 368.531 32.8732 369.027 32.8732C369.297 32.8732 369.36 33.0984 369.36 33.3235C369.36 33.4406 369.342 33.5667 369.333 33.5937L368.486 36.953Z" fill="black"/>
<path d="M379.083 31.9911C379.275 31.9911 379.539 31.9911 379.539 31.7149C379.539 31.4388 379.275 31.4388 379.083 31.4388H373.727C373.883 29.7696 375.276 28.4967 377.101 28.4967H379.083C379.275 28.4967 379.539 28.4967 379.539 28.2205C379.539 27.9443 379.275 27.9443 379.083 27.9443H377.065C374.904 27.9443 373.163 29.6255 373.163 31.7149C373.163 33.8164 374.916 35.4855 377.065 35.4855H379.083C379.275 35.4855 379.539 35.4855 379.539 35.2094C379.539 34.9332 379.275 34.9332 379.083 34.9332H377.101C375.276 34.9332 373.883 33.6603 373.727 31.9911H379.083Z" fill="black"/>
<path d="M388.67 26.5874C388.682 26.5394 388.706 26.4673 388.706 26.4073C388.706 26.3232 388.634 26.2632 388.55 26.2632C388.465 26.2632 388.441 26.2872 388.309 26.4313C388.177 26.5874 387.865 26.9717 387.733 27.1158C387.265 26.4313 386.508 26.2632 385.896 26.2632C384.227 26.2632 382.798 27.6682 382.798 29.0371C382.798 29.7336 383.17 30.1419 383.23 30.2259C383.626 30.6462 383.974 30.7423 384.839 30.9464C385.259 31.0545 385.283 31.0545 385.632 31.1386C385.98 31.2226 386.748 31.4147 386.748 32.4114C386.748 33.4562 385.716 34.5729 384.455 34.5729C383.674 34.5729 382.245 34.3328 382.245 32.8437C382.245 32.8077 382.245 32.5555 382.317 32.2673L382.329 32.1473C382.329 32.0032 382.197 31.9911 382.149 31.9911C381.993 31.9911 381.981 32.0392 381.921 32.3154L381.429 34.2727C381.381 34.4528 381.297 34.7771 381.297 34.8131C381.297 34.9092 381.369 34.9692 381.453 34.9692C381.537 34.9692 381.549 34.9572 381.681 34.8011L382.257 34.1286C382.533 34.4889 383.218 34.9692 384.431 34.9692C386.172 34.9692 387.601 33.3961 387.601 31.9551C387.601 31.4027 387.421 30.9704 387.097 30.6342C386.736 30.2379 386.34 30.1419 385.776 29.9978C385.427 29.9137 384.959 29.8056 384.683 29.7336C384.323 29.6495 383.65 29.4094 383.65 28.5688C383.65 27.6321 384.659 26.6234 385.884 26.6234C386.964 26.6234 387.709 27.1878 387.709 28.4847C387.709 28.7729 387.661 29.0251 387.661 29.0731C387.661 29.2172 387.769 29.2412 387.853 29.2412C387.997 29.2412 388.009 29.1932 388.057 29.0011L388.67 26.5874Z" fill="black"/>
<path d="M407.197 10.9907C407.197 10.8286 407.071 10.8286 407.035 10.8286C406.963 10.8286 406.945 10.8466 406.728 11.1168C406.62 11.2429 405.882 12.1795 405.864 12.1976C405.269 11.0267 404.081 10.8286 403.324 10.8286C401.036 10.8286 398.965 12.9181 398.965 14.9535C398.965 16.3044 399.776 17.0969 400.658 17.4032C400.856 17.4752 401.919 17.7634 402.459 17.8895C403.378 18.1417 403.612 18.2137 403.99 18.61C404.063 18.7001 404.423 19.1143 404.423 19.9609C404.423 21.6361 402.874 23.3653 401.072 23.3653C399.595 23.3653 397.956 22.7348 397.956 20.7174C397.956 20.3752 398.028 19.9429 398.082 19.7628C398.082 19.7088 398.1 19.6187 398.1 19.5827C398.1 19.5106 398.064 19.4206 397.92 19.4206C397.758 19.4206 397.74 19.4566 397.668 19.7628L396.749 23.4553C396.749 23.4734 396.677 23.7075 396.677 23.7255C396.677 23.8876 396.822 23.8876 396.858 23.8876C396.93 23.8876 396.948 23.8696 397.164 23.5994L397.992 22.5187C398.425 23.1671 399.361 23.8876 401.036 23.8876C403.36 23.8876 405.486 21.6361 405.486 19.3845C405.486 18.628 405.305 17.9615 404.621 17.2951C404.243 16.9168 403.918 16.8268 402.261 16.3945C401.054 16.0702 400.892 16.0162 400.568 15.728C400.262 15.4218 400.028 14.9895 400.028 14.3771C400.028 12.864 401.559 11.3149 403.27 11.3149C405.035 11.3149 405.864 12.3957 405.864 14.1069C405.864 14.5752 405.774 15.0615 405.774 15.1336C405.774 15.2957 405.918 15.2957 405.972 15.2957C406.134 15.2957 406.152 15.2417 406.224 14.9535L407.197 10.9907Z" fill="black"/>
<path d="M412.752 20.9336C412.752 20.8435 412.68 20.7715 412.572 20.7715C412.41 20.7715 412.392 20.8255 412.302 21.1317C411.833 22.7708 411.095 23.3292 410.5 23.3292C410.284 23.3292 410.032 23.2752 410.032 22.7348C410.032 22.2485 410.248 21.7081 410.446 21.1677L411.707 17.8174C411.761 17.6733 411.887 17.3491 411.887 17.0069C411.887 16.2503 411.347 15.5659 410.464 15.5659C408.807 15.5659 408.141 18.1777 408.141 18.3218C408.141 18.3938 408.213 18.4839 408.339 18.4839C408.501 18.4839 408.519 18.4118 408.591 18.1597C409.023 16.6466 409.708 15.9261 410.41 15.9261C410.572 15.9261 410.879 15.9441 410.879 16.5205C410.879 16.9888 410.644 17.5833 410.5 17.9795L409.239 21.3298C409.131 21.618 409.023 21.9062 409.023 22.2305C409.023 23.041 409.582 23.6895 410.446 23.6895C412.103 23.6895 412.752 21.0597 412.752 20.9336ZM412.608 12.2696C412.608 11.9994 412.392 11.6752 411.995 11.6752C411.581 11.6752 411.113 12.0714 411.113 12.5398C411.113 12.9901 411.491 13.1342 411.707 13.1342C412.194 13.1342 412.608 12.6659 412.608 12.2696Z" fill="black"/>
<path d="M417.382 18.2318C417.418 18.1237 417.868 17.2231 418.535 16.6467C419.003 16.2144 419.615 15.9262 420.318 15.9262C421.038 15.9262 421.29 16.4665 421.29 17.187C421.29 17.2951 421.29 17.6554 421.074 18.5019L420.624 20.3572C420.48 20.8976 420.138 22.2305 420.102 22.4287C420.03 22.6988 419.922 23.1672 419.922 23.2392C419.922 23.4914 420.12 23.6895 420.39 23.6895C420.93 23.6895 421.02 23.2752 421.182 22.6268L422.263 18.3218C422.299 18.1777 423.236 15.9262 425.217 15.9262C425.938 15.9262 426.19 16.4665 426.19 17.187C426.19 18.1957 425.487 20.1591 425.091 21.2398C424.929 21.6721 424.839 21.9063 424.839 22.2305C424.839 23.0411 425.397 23.6895 426.262 23.6895C427.937 23.6895 428.567 21.0417 428.567 20.9336C428.567 20.8436 428.495 20.7715 428.387 20.7715C428.225 20.7715 428.207 20.8256 428.117 21.1318C427.703 22.5728 427.036 23.3293 426.316 23.3293C426.136 23.3293 425.848 23.3113 425.848 22.7349C425.848 22.2666 426.064 21.6902 426.136 21.492C426.46 20.6274 427.271 18.5019 427.271 17.4572C427.271 16.3765 426.64 15.5659 425.271 15.5659C424.064 15.5659 423.092 16.2504 422.371 17.3131C422.317 16.3405 421.723 15.5659 420.372 15.5659C418.769 15.5659 417.922 16.7007 417.598 17.151C417.544 16.1243 416.805 15.5659 416.013 15.5659C415.49 15.5659 415.076 15.8181 414.734 16.5026C414.41 17.151 414.158 18.2498 414.158 18.3218C414.158 18.3939 414.23 18.4839 414.356 18.4839C414.5 18.4839 414.518 18.4659 414.626 18.0516C414.896 16.9889 415.238 15.9262 415.959 15.9262C416.373 15.9262 416.517 16.2144 416.517 16.7547C416.517 17.151 416.337 17.8535 416.211 18.4119L415.707 20.3572C415.635 20.6995 415.436 21.51 415.346 21.8343C415.22 22.3026 415.022 23.1492 415.022 23.2392C415.022 23.4914 415.22 23.6895 415.49 23.6895C415.707 23.6895 415.959 23.5815 416.103 23.3113C416.139 23.2212 416.301 22.5908 416.391 22.2305L416.787 20.6094L417.382 18.2318Z" fill="black"/>
<path d="M432.485 23.7615C432.677 23.5814 433.181 23.1851 433.373 23.017C434.118 22.3325 434.826 21.6721 434.826 20.5793C434.826 19.1503 433.625 18.2257 432.124 18.2257C430.683 18.2257 429.735 19.3185 429.735 20.3872C429.735 20.9756 430.203 21.0597 430.371 21.0597C430.623 21.0597 430.996 20.8795 430.996 20.4232C430.996 19.7988 430.395 19.7988 430.251 19.7988C430.599 18.9222 431.404 18.622 431.992 18.622C433.109 18.622 433.686 19.5706 433.686 20.5793C433.686 21.8282 432.809 22.7408 431.392 24.1938L429.879 25.7549C429.735 25.887 429.735 25.911 429.735 26.2112H434.478L434.826 24.0617H434.454C434.418 24.3019 434.322 24.9023 434.178 25.1305C434.106 25.2265 433.193 25.2265 433.001 25.2265H430.864L432.485 23.7615Z" fill="black"/>
<path d="M442.082 27.8864C442.082 27.8323 442.082 27.7963 441.776 27.4901C439.975 25.6708 438.966 22.6988 438.966 19.0242C438.966 15.5298 439.813 12.5217 441.902 10.3963C442.082 10.2342 442.082 10.1981 442.082 10.1441C442.082 10.036 441.992 10 441.92 10C441.686 10 440.209 11.2969 439.327 13.0621C438.408 14.8814 437.994 16.8087 437.994 19.0242C437.994 20.6273 438.246 22.7708 439.182 24.6982C440.245 26.8597 441.722 28.0305 441.92 28.0305C441.992 28.0305 442.082 27.9944 442.082 27.8864Z" fill="black"/>
<path d="M445.915 22.1764C445.753 22.8068 445.717 22.9869 444.456 22.9869C444.114 22.9869 443.934 22.9869 443.934 23.3112C443.934 23.5093 444.042 23.5093 444.402 23.5093H450.112C453.751 23.5093 457.299 19.7447 457.299 15.7279C457.299 13.1341 455.75 11.2068 453.174 11.2068H447.392C447.05 11.2068 446.888 11.2068 446.888 11.549C446.888 11.7291 447.05 11.7291 447.32 11.7291C448.419 11.7291 448.419 11.8732 448.419 12.0714C448.419 12.1074 448.419 12.2155 448.347 12.4857L445.915 22.1764ZM449.716 12.4316C449.878 11.7652 449.95 11.7291 450.652 11.7291H452.634C454.327 11.7291 455.876 12.6478 455.876 15.1335C455.876 16.0341 455.516 19.1683 453.769 21.1497C453.264 21.7441 451.895 22.9869 449.824 22.9869H447.77C447.518 22.9869 447.482 22.9869 447.374 22.9689C447.176 22.9509 447.158 22.9149 447.158 22.7708C447.158 22.6447 447.194 22.5366 447.23 22.3745L449.716 12.4316Z" fill="black"/>
<path d="M461.309 18.718C461.309 18.4539 461.117 18.2617 460.841 18.2617C460.529 18.2617 460.181 18.5499 460.181 18.9102C460.181 19.1743 460.373 19.3665 460.649 19.3665C460.961 19.3665 461.309 19.0783 461.309 18.718ZM459.556 23.1251L458.908 24.7822C458.848 24.9624 458.788 25.1065 458.788 25.3106C458.788 25.899 459.244 26.3313 459.88 26.3313C461.045 26.3313 461.538 24.6501 461.538 24.494C461.538 24.3739 461.442 24.3379 461.357 24.3379C461.213 24.3379 461.189 24.422 461.153 24.5421C460.877 25.5027 460.385 25.9951 459.904 25.9951C459.76 25.9951 459.616 25.935 459.616 25.6108C459.616 25.3226 459.7 25.1065 459.856 24.7342C459.977 24.41 460.097 24.0857 460.229 23.7615L460.601 22.7888C460.709 22.5127 460.853 22.1404 460.853 21.9363C460.853 21.3358 460.373 20.9156 459.76 20.9156C458.596 20.9156 458.091 22.5967 458.091 22.7528C458.091 22.8609 458.175 22.9089 458.271 22.9089C458.427 22.9089 458.439 22.8369 458.475 22.7168C458.812 21.576 459.364 21.2518 459.724 21.2518C459.892 21.2518 460.013 21.3118 460.013 21.6481C460.013 21.7681 460.001 21.9363 459.88 22.2965L459.556 23.1251Z" fill="black"/>
<path d="M466.338 23.5813C466.338 22.5366 465.996 21.7621 465.257 21.7621C464.681 21.7621 464.392 22.2304 464.392 22.6267C464.392 23.023 464.663 23.5093 465.275 23.5093C465.509 23.5093 465.707 23.4372 465.87 23.2751C465.906 23.2391 465.924 23.2391 465.942 23.2391C465.978 23.2391 465.978 23.4913 465.978 23.5813C465.978 24.1758 465.87 25.3466 464.825 26.5174C464.627 26.7335 464.627 26.7695 464.627 26.8056C464.627 26.8956 464.717 26.9857 464.807 26.9857C464.951 26.9857 466.338 25.6528 466.338 23.5813Z" fill="black"/>
<path d="M473.555 22.1764C473.392 22.8068 473.356 22.9869 472.096 22.9869C471.753 22.9869 471.573 22.9869 471.573 23.3112C471.573 23.5093 471.681 23.5093 472.042 23.5093H477.751C481.39 23.5093 484.938 19.7447 484.938 15.7279C484.938 13.1341 483.389 11.2068 480.814 11.2068H475.032C474.689 11.2068 474.527 11.2068 474.527 11.549C474.527 11.7291 474.689 11.7291 474.96 11.7291C476.058 11.7291 476.058 11.8732 476.058 12.0714C476.058 12.1074 476.058 12.2155 475.986 12.4857L473.555 22.1764ZM477.355 12.4316C477.517 11.7652 477.589 11.7291 478.292 11.7291H480.273C481.966 11.7291 483.515 12.6478 483.515 15.1335C483.515 16.0341 483.155 19.1683 481.408 21.1497C480.904 21.7441 479.535 22.9869 477.463 22.9869H475.41C475.158 22.9869 475.122 22.9869 475.014 22.9689C474.815 22.9509 474.797 22.9149 474.797 22.7708C474.797 22.6447 474.833 22.5366 474.869 22.3745L477.355 12.4316Z" fill="black"/>
<path d="M490.33 18.718C490.33 18.4899 490.15 18.2617 489.862 18.2617C489.501 18.2617 489.189 18.598 489.189 18.9102C489.189 19.1383 489.369 19.3665 489.657 19.3665C490.018 19.3665 490.33 19.0302 490.33 18.718ZM487.82 26.8116C487.64 27.5441 487.052 28.3247 486.319 28.3247C486.127 28.3247 485.947 28.2766 485.923 28.2646C486.295 28.0845 486.343 27.7603 486.343 27.6522C486.343 27.364 486.127 27.2079 485.875 27.2079C485.527 27.2079 485.202 27.5081 485.202 27.9044C485.202 28.3607 485.647 28.6609 486.331 28.6609C487.064 28.6609 488.385 28.2046 488.745 26.7636L489.825 22.4646C489.862 22.3205 489.886 22.2245 489.886 22.0443C489.886 21.3839 489.357 20.9156 488.661 20.9156C487.388 20.9156 486.643 22.5967 486.643 22.7528C486.643 22.8609 486.727 22.9089 486.823 22.9089C486.956 22.9089 486.968 22.8729 487.052 22.6928C487.412 21.8642 488.024 21.2518 488.625 21.2518C488.877 21.2518 489.021 21.4199 489.021 21.8162C489.021 21.9843 488.985 22.1524 488.949 22.3205L487.82 26.8116Z" fill="black"/>
<path d="M497.052 19.0242C497.052 17.6553 496.872 15.4217 495.864 13.3323C494.801 11.1708 493.324 10 493.126 10C493.054 10 492.964 10.036 492.964 10.1441C492.964 10.1981 492.964 10.2342 493.27 10.5404C495.071 12.3596 496.08 15.3317 496.08 19.0062C496.08 22.5006 495.233 25.5087 493.144 27.6342C492.964 27.7963 492.964 27.8323 492.964 27.8864C492.964 27.9944 493.054 28.0305 493.126 28.0305C493.36 28.0305 494.837 26.7336 495.719 24.9684C496.638 23.1311 497.052 21.1857 497.052 19.0242Z" fill="black"/>
<path d="M501.624 10H499.21V10.6665H500.957V27.346H499.21V28.0125H501.624V10Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 50 KiB

@@ -0,0 +1,13 @@
<svg width="956" height="199" viewBox="0 0 956 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">meat | organic | beef | emissions | health | pesticides | foods | farming | conventional&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">explosion | atmosphere | eruption | eruptions | crust | volcanoes | earthquakes&#10;</tspan><tspan x="452" y="79.8636">&#10;</tspan><tspan x="470" y="94.8636">immune | system | cells | immunology | adaptive | body | memory | antibodies&#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">moon | lunar | tides | moons | surface | gravity | tide | meters | oceans | dust&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">eu | democratic | vote | parliament | citizen | laws | institutions | influence | nations&#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">plastics | tons | pollution | waste | microplastics | polymers | ocean | bpa | cotton</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">MaximalMarginalRelevance</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.8 KiB

@@ -0,0 +1,13 @@
<svg width="956" height="199" viewBox="0 0 956 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">Organic vs Conventional Food: Environmental and Health Considerations&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">Volcanic Eruptions and Impacts&#10;</tspan><tspan x="452" y="79.8636">&#10;</tspan><tspan x="470" y="94.8636">The Immune System: Understanding and Boosting Immunity&#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">The Moon&#39;s Tides and Orbit Phenomena&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">Democracy in the European Union&#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">Plastic Pollution and its environmental impact</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">OpenAI</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.6 KiB

@@ -0,0 +1,17 @@
<svg width="736" height="66" viewBox="0 0 736 66" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="0.5" y="12.5" width="69" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="47.2617" y="35.7637"> n</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="15" y="35.7637">Topic</tspan></text>
<rect x="250.5" y="3.5" width="137" height="58" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="265.062" y="19.8636">Extract documents </tspan><tspan x="261.992" y="34.8636">that contain at least </tspan><tspan x="282.183" y="49.8636">one keyword</tspan></text>
<rect x="605.5" y="12.5" width="130" height="44" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="620.495" y="28.8636">Sort keywords by </tspan><tspan x="613.481" y="43.8636">their c-TF-IDF value</tspan></text>
<rect x="419.5" y="0.5" width="153" height="65" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="425.06" y="15.8636">Use the POS matcher on </tspan><tspan x="438.109" y="30.8636">those documents to </tspan><tspan x="426.039" y="45.8636">generate new candidate </tspan><tspan x="467.628" y="60.8636">keywords</tspan></text>
<path d="M93.7071 33.7071C94.0976 33.3166 94.0976 32.6834 93.7071 32.2929L87.3431 25.9289C86.9526 25.5384 86.3195 25.5384 85.9289 25.9289C85.5384 26.3195 85.5384 26.9526 85.9289 27.3431L91.5858 33L85.9289 38.6569C85.5384 39.0474 85.5384 39.6805 85.9289 40.0711C86.3195 40.4616 86.9526 40.4616 87.3431 40.0711L93.7071 33.7071ZM75 34L93 34V32L75 32V34Z" fill="black"/>
<path d="M243.707 33.7071C244.098 33.3166 244.098 32.6834 243.707 32.2929L237.343 25.9289C236.953 25.5384 236.319 25.5384 235.929 25.9289C235.538 26.3195 235.538 26.9526 235.929 27.3431L241.586 33L235.929 38.6569C235.538 39.0474 235.538 39.6805 235.929 40.0711C236.319 40.4616 236.953 40.4616 237.343 40.0711L243.707 33.7071ZM225 34L243 34V32L225 32V34Z" fill="black"/>
<path d="M411.707 33.7071C412.098 33.3166 412.098 32.6834 411.707 32.2929L405.343 25.9289C404.953 25.5384 404.319 25.5384 403.929 25.9289C403.538 26.3195 403.538 26.9526 403.929 27.3431L409.586 33L403.929 38.6569C403.538 39.0474 403.538 39.6805 403.929 40.0711C404.319 40.4616 404.953 40.4616 405.343 40.0711L411.707 33.7071ZM393 34L411 34V32L393 32V34Z" fill="black"/>
<path d="M597.707 33.7071C598.098 33.3166 598.098 32.6834 597.707 32.2929L591.343 25.9289C590.953 25.5384 590.319 25.5384 589.929 25.9289C589.538 26.3195 589.538 26.9526 589.929 27.3431L595.586 33L589.929 38.6569C589.538 39.0474 589.538 39.6805 589.929 40.0711C590.319 40.4616 590.953 40.4616 591.343 40.0711L597.707 33.7071ZM579 34L597 34V32L579 32V34Z" fill="black"/>
<rect x="100.5" y="11.5" width="119" height="39" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="109.606" y="27.8636">Extract candidate </tspan><tspan x="132.381" y="42.8636">keywords</tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 3.4 KiB

@@ -0,0 +1,14 @@
<svg width="934" height="199" viewBox="0 0 934 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">meat | organic | food | beef | emissions | most | health | pesticides | production&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">explosion | atmosphere | eruption | kilometers | eruptions | fireball | super&#10;</tspan><tspan x="470" y="94.8636">immune | system | cells | immunology | adaptive | body | memory | cell &#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">moon | earth | lunar | tides | water | orbit | base | moons | surface | gravity&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">democratic | vote | parliament | member | union | states | national | countries &#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers | bag</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="79.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">PartOfSpeech</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.9 KiB

@@ -0,0 +1,245 @@
One of the core components of BERTopic is its Bag-of-Words representation and weighting with c-TF-IDF. This method is fast and can quickly generate a number of keywords for a topic without depending on the clustering task. As a result, topics can easily and quickly be updated after training the model without the need to re-train it.
Although these give good topic representations, we may want to further fine-tune the topic representations.
As such, there are a number of representation models implemented in BERTopic that allow for further fine-tuning of the topic representations. These are optional
and are **not used by default**. You are not restricted in how the representation can be fine-tuned, from GPT-like models to fast keyword extraction
with KeyBERT-like models:
<iframe width="1200" height="500" src="https://user-images.githubusercontent.com/25746895/218417067-a81cc179-9055-49ba-a2b0-f2c1db535159.mp4" title="BERTopic Overview" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
For each model below, an example will be shown on how it may change or improve upon the default topic keywords that are generated. The dataset used in these examples can be found [here](https://www.kaggle.com/datasets/maartengr/kurzgesagt-transcriptions).
If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic.
## **KeyBERTInspired**
After having generated our topics with c-TF-IDF, we might want to do some fine-tuning based on the semantic
relationship between keywords/keyphrases and the set of documents in each topic. Although we can use a centroid-based
technique for this, it can be costly and does not take the structure of a cluster into account. Instead, we leverage
c-TF-IDF to create a set of representative documents per topic and use those as our updated topic embedding. Then, we calculate
the similarity between candidate keywords and the topic embedding using the same embedding model that embedded the documents.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/keybertinspired.svg"
</div>
<br>
Thus, the algorithm follows some principles of [KeyBERT](https://github.com/MaartenGr/KeyBERT) but does some optimization in
order to speed up inference. Usage is straightforward:
```python
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
# Create your representation model
representation_model = KeyBERTInspired()
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/keybert.svg"
</div>
<br>
## **PartOfSpeech**
Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from
all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part-of-speech tagging on a subset of
keywords and documents that best represent a topic.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/partofspeech.svg"
</div>
<br>
More specifically, we find documents that contain the keywords from our candidate topics as calculated with c-TF-IDF. These documents serve
as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic.
These candidate keywords are first put through Spacy's POS module to see whether they match with the `DEFAULT_PATTERNS`:
```python
DEFAULT_PATTERNS = [
[{'POS': 'ADJ'}, {'POS': 'NOUN'}],
[{'POS': 'NOUN'}],
[{'POS': 'ADJ'}]
]
```
These patterns follow Spacy's [Rule-Based Matching](https://spacy.io/usage/rule-based-matching). Then, the resulting keywords are sorted by
their respective c-TF-IDF values.
```python
from bertopic.representation import PartOfSpeech
from bertopic import BERTopic
# Create your representation model
representation_model = PartOfSpeech("en_core_web_sm")
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/pos.svg"
</div>
<br>
You can define custom POS patterns to be extracted:
```python
pos_patterns = [
[{'POS': 'ADJ'}, {'POS': 'NOUN'}],
[{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
]
representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
```
## **MaximalMarginalRelevance**
When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars"
essentially represent the same information and are often redundant.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/mmr.svg"
</div>
<br>
<!-- MMR = arg \underset{D_i\in R\setminus S}{max} [\lambda Sim_{1}(D_{i}, Q) - (1-\lambda) \,\, \underset{D_{j}\in S}{max} \,\, Sim_{2}(D_{i}, D_{j})] -->
To decrease this redundancy and improve the diversity of keywords, we can use an algorithm called Maximal Marginal Relevance (MMR). MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords
that maximizes their diversity with respect to the document.
```python
from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic
# Create your representation model
representation_model = MaximalMarginalRelevance(diversity=0.3)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/mmr_output.svg"
</div>
<br>
## **Zero-Shot Classification**
For some use cases, you might already have a set of candidate labels that you would like to automatically assign to some of the topics.
Although we can use guided or supervised BERTopic for that, we can also use zero-shot classification to assign labels to our topics.
For that, we can make use of 🤗 transformers and the models on the [model hub](https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads).
To perform this classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels.
If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords.
We use it in BERTopic as follows:
```python
from bertopic.representation import ZeroShotClassification
from bertopic import BERTopic
# Create your representation model
candidate_topics = ["space and nasa", "bicycles", "sports"]
representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli")
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```
<br>
<div class="svg_image">
--8<-- "docs/getting_started/representation/zero.svg"
</div>
<br>
## **Chain Models**
All of the above models can make use of the candidate topics, as generated by c-TF-IDF, to further fine-tune the topic representations. For example, `MaximalMarginalRelevance` takes the keywords in the candidate topics and re-ranks them. Similarly, the keywords in the candidate topic can be used as the input for GPT-prompts in `OpenAI`.
Although the default candidate topics are generated by c-TF-IDF, what if we were to chain these models? For example, we can use `MaximalMarginalRelevance` to improve upon the keywords in each topic before passing them to `OpenAI`.
This is supported in BERTopic by simply passing a list of representation models when instantiating the topic model:
```python
from bertopic.representation import MaximalMarginalRelevance, OpenAI
from bertopic import BERTopic
import openai
# Create your representation models
client = openai.OpenAI(api_key="sk-...")
openai_generator = OpenAI(client)
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, openai_generator]
# Use the chained models
topic_model = BERTopic(representation_model=representation_models)
```
## **Custom Model**
Although several representation models have been implemented in BERTopic, new technologies get released often and we should not have to wait until they get implemented in BERTopic. Therefore, you can create your own representation model and use that to fine-tune the topics.
The following is the basic structure for creating your custom model. Note that it returns the same topics as those
calculated with c-TF-IDF:
```python
from typing import List, Mapping, Tuple
from bertopic.representation._base import BaseRepresentation
class CustomRepresentationModel(BaseRepresentation):
def extract_topics(self, topic_model, documents, c_tf_idf, topics
) -> Mapping[str, List[Tuple[str, float]]]:
""" Extract topics
Arguments:
topic_model: The BERTopic model
documents: A dataframe of documents with their related topics
c_tf_idf: The c-TF-IDF matrix
topics: The candidate topics as calculated with c-TF-IDF
Returns:
updated_topics: Updated topic representations
"""
updated_topics = topics.copy()
return updated_topics
```
Then, we can use that model as follows:
```python
from bertopic import BERTopic
# Create our custom representation model
representation_model = CustomRepresentationModel()
# Pass our custom representation model to BERTopic
topic_model = BERTopic(representation_model=representation_model)
```
There are a few things to take into account when creating your custom model:
* It needs to have the exact same parameter input: `topic_model`, `documents`, `c_tf_idf`, `topics`.
* Make sure that `updated_topics` has the exact same structure as `topics`:
```python
updated_topics = {
"1": [("space", 0.9), ("nasa", 0.7)],
"2": [("science", 0.66), ("article", 0.6)]
}
```
!!! Tip
You can change the `__init__` however you want, it does not influence the underlying structure. This
also means that you can save data/embeddings/representations/sentiment in your custom representation
model.
@@ -0,0 +1,53 @@
<svg width="445" height="248" viewBox="0 0 445 248" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="132" y="210" width="118" height="38" fill="#64B5F6"/>
<rect x="224" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="196" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="168" y="200" width="20" height="8" fill="#64B5F6"/>
<rect x="140" y="200" width="20" height="8" fill="#64B5F6"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="158.256" y="237.939">SBERT</tspan></text>
<rect x="132" y="170" width="118" height="38" fill="#E57373"/>
<rect x="224" y="160" width="20" height="8" fill="#E57373"/>
<rect x="196" y="160" width="20" height="8" fill="#E57373"/>
<rect x="168" y="160" width="20" height="8" fill="#E57373"/>
<rect x="140" y="160" width="20" height="8" fill="#E57373"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="161.254" y="197.939">UMAP</tspan></text>
<rect x="132" y="130" width="118" height="38" fill="#4DB6AC"/>
<rect x="224" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="196" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="168" y="120" width="20" height="8" fill="#4DB6AC"/>
<rect x="140" y="120" width="20" height="8" fill="#4DB6AC"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="141.342" y="157.939">HDBSCAN</tspan></text>
<rect y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="92" width="20" height="8" fill="#3F51B5"/>
<rect x="64" width="20" height="8" fill="#3F51B5"/>
<rect x="36" width="20" height="8" fill="#3F51B5"/>
<rect x="8" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="14.7324" y="35.9395">GPT / T5</tspan></text>
<rect x="132" y="90" width="118" height="38" fill="#FFD54F"/>
<rect x="224" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="196" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="168" y="80" width="20" height="8" fill="#FFD54F"/>
<rect x="140" y="80" width="20" height="8" fill="#FFD54F"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="13" font-weight="bold" letter-spacing="0em"><tspan x="138.346" y="113.161">CountVectorizer</tspan></text>
<rect x="132" y="50" width="118" height="38" fill="#90A4AE"/>
<rect x="224" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="196" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="168" y="40" width="20" height="8" fill="#90A4AE"/>
<rect x="140" y="40" width="20" height="8" fill="#90A4AE"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="146.938" y="77.9395">c-TF-IDF</tspan></text>
<rect x="132" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="224" width="20" height="8" fill="#3F51B5"/>
<rect x="196" width="20" height="8" fill="#3F51B5"/>
<rect x="168" width="20" height="8" fill="#3F51B5"/>
<rect x="140" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="145.932" y="34.9395">KeyBERT</tspan></text>
<rect x="327" y="10" width="118" height="38" fill="#3F51B5"/>
<rect x="419" width="20" height="8" fill="#3F51B5"/>
<rect x="391" width="20" height="8" fill="#3F51B5"/>
<rect x="363" width="20" height="8" fill="#3F51B5"/>
<rect x="335" width="20" height="8" fill="#3F51B5"/>
<text fill="white" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="20" font-weight="bold" letter-spacing="0em"><tspan x="360.863" y="34.9395">MMR</tspan></text>
<circle cx="266.5" cy="28.5" r="5.5" fill="black"/>
<circle cx="285.5" cy="28.5" r="5.5" fill="black"/>
<circle cx="307.5" cy="28.5" r="5.5" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.1 KiB

@@ -0,0 +1,13 @@
<svg width="862" height="199" viewBox="0 0 862 199" fill="none" xmlns="http://www.w3.org/2000/svg">
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="0" y="19.8636">&#10;</tspan><tspan x="18" y="34.8636">meat | organic | food | beef | emissions | eat | of | eating | is&#10;</tspan><tspan x="18" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of | &#10;</tspan><tspan x="0" y="79.8636">&#10;</tspan><tspan x="18" y="94.8636">immune | system | your | cells | my | and | is | the | how | of&#10;</tspan><tspan x="0" y="109.864">&#10;</tspan><tspan x="18" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons &#10;</tspan><tspan x="0" y="139.864">&#10;</tspan><tspan x="18" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="0" y="169.864">&#10;</tspan><tspan x="18" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" font-weight="bold" letter-spacing="0em"><tspan x="0" y="49.8636">&#10;</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="6" y="13.5909">Default Representation</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="452" y="19.8636">&#10;</tspan><tspan x="470" y="34.8636">Organic food&#10;</tspan><tspan x="452" y="49.8636">&#10;</tspan><tspan x="470" y="64.8636">the | explosion | atmosphere | eruption | kilometers | of&#10;</tspan><tspan x="452" y="79.8636">&#10;</tspan><tspan x="470" y="94.8636">Your immune system&#10;</tspan><tspan x="452" y="109.864">&#10;</tspan><tspan x="470" y="124.864">moon | earth | lunar | tides | the | water | orbit | base | moons&#10;</tspan><tspan x="452" y="139.864">&#10;</tspan><tspan x="470" y="154.864">eu | european | democratic | vote | parliament | member | union&#10;</tspan><tspan x="452" y="169.864">&#10;</tspan><tspan x="470" y="184.864">plastic | plastics | tons | pollution | waste | microplastics | polymers </tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="14" font-weight="bold" letter-spacing="0em"><tspan x="458" y="17.5909">ZeroShotClassification</tspan></text>
<path d="M448.354 182.354C448.549 182.158 448.549 181.842 448.354 181.646L445.172 178.464C444.976 178.269 444.66 178.269 444.464 178.464C444.269 178.66 444.269 178.976 444.464 179.172L447.293 182L444.464 184.828C444.269 185.024 444.269 185.34 444.464 185.536C444.66 185.731 444.976 185.731 445.172 185.536L448.354 182.354ZM418 182.5H448V181.5H418V182.5Z" fill="black"/>
<path d="M448.354 152.354C448.549 152.158 448.549 151.842 448.354 151.646L445.172 148.464C444.976 148.269 444.66 148.269 444.464 148.464C444.269 148.66 444.269 148.976 444.464 149.172L447.293 152L444.464 154.828C444.269 155.024 444.269 155.34 444.464 155.536C444.66 155.731 444.976 155.731 445.172 155.536L448.354 152.354ZM418 152.5H448V151.5H418V152.5Z" fill="black"/>
<path d="M448.354 122.354C448.549 122.158 448.549 121.842 448.354 121.646L445.172 118.464C444.976 118.269 444.66 118.269 444.464 118.464C444.269 118.66 444.269 118.976 444.464 119.172L447.293 122L444.464 124.828C444.269 125.024 444.269 125.34 444.464 125.536C444.66 125.731 444.976 125.731 445.172 125.536L448.354 122.354ZM418 122.5H448V121.5H418V122.5Z" fill="black"/>
<path d="M448.354 92.3536C448.549 92.1583 448.549 91.8417 448.354 91.6464L445.172 88.4645C444.976 88.2692 444.66 88.2692 444.464 88.4645C444.269 88.6597 444.269 88.9763 444.464 89.1716L447.293 92L444.464 94.8284C444.269 95.0237 444.269 95.3403 444.464 95.5355C444.66 95.7308 444.976 95.7308 445.172 95.5355L448.354 92.3536ZM418 92.5H448V91.5H418V92.5Z" fill="black"/>
<path d="M448.354 62.3536C448.549 62.1583 448.549 61.8417 448.354 61.6464L445.172 58.4645C444.976 58.2692 444.66 58.2692 444.464 58.4645C444.269 58.6597 444.269 58.9763 444.464 59.1716L447.293 62L444.464 64.8284C444.269 65.0237 444.269 65.3403 444.464 65.5355C444.66 65.7308 444.976 65.7308 445.172 65.5355L448.354 62.3536ZM418 62.5H448V61.5H418V62.5Z" fill="black"/>
<path d="M448.354 32.3536C448.549 32.1583 448.549 31.8417 448.354 31.6464L445.172 28.4645C444.976 28.2692 444.66 28.2692 444.464 28.4645C444.269 28.6597 444.269 28.9763 444.464 29.1716L447.293 32L444.464 34.8284C444.269 35.0237 444.269 35.3403 444.464 35.5355C444.66 35.7308 444.976 35.7308 445.172 35.5355L448.354 32.3536ZM418 32.5H448V31.5H418V32.5Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 4.6 KiB

@@ -0,0 +1,40 @@
After having created a BERTopic model, you might end up with over a hundred topics. Searching through those
can be quite cumbersome especially if you are searching for a specific topic. Fortunately, BERTopic allows you
to search for topics using search terms. First, let's create and train a BERTopic model:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
# Create topics
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
```
After having trained our model, we can use `find_topics` to search for topics that are similar
to an input search_term. Here, we are going to be searching for topics that closely relate to the
search term "motor". Then, we extract the most similar topic and check the results:
```python
>>> similar_topics, similarity = topic_model.find_topics("motor", top_n=5)
>>> topic_model.get_topic(similar_topics[0])
[('bike', 0.02275997701645559),
('motorcycle', 0.011391202866080292),
('bikes', 0.00981187573649205),
('dod', 0.009614623748226669),
('honda', 0.008247663662558535),
('ride', 0.0064683227888861945),
('harley', 0.006355502638631013),
('riding', 0.005766601561614182),
('motorcycles', 0.005596372493714447),
('advice', 0.005534544418830091)]
```
It definitely seems that a topic was found that closely matches "motor". The topic seems to be motorcycle
related and therefore matches our "motor" input. You can use the `similarity` variable to see how similar
the extracted topics are to the search term.
!!! note
You can only use this method if an embedding model was supplied to BERTopic using `embedding_model`.
@@ -0,0 +1,59 @@
When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain specific abbreviations, slang, short form, acronyms, etc. For example, the *"TNM"* classification is a method for identifying the stage of most cancers. The word *"TNM"* is an abbreviation and might not be correctly captured in generic embedding models.
To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of `seed_words` in the `bertopic.vectorizers.ClassTfidfTransformer`. The `ClassTfidfTransformer` is the base representation of BERTopic and essentially represents each topic as a bag of words. As such, we can choose to increase the importance of certain words, such as *"TNM"*.
To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like "agent" and "robot" should be important in such a topic were it to be found. Using the `ClassTfidfTransformer`, we can define those `seed_words` and also choose by how much their values are multiplied.
The full example is then as follows:
```python
from umap import UMAP
from datasets import load_dataset
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
# Let's take a subset of ArXiv abstracts as the training data
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
abstracts = dataset["abstract"][:5_000]
# For illustration purposes, we make sure the output is fixed when running this code multiple times
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# We can choose any number of seed words for which we want their representation
# to be strengthened. We increase the importance of these words as we want them to be more
# likely to end up in the topic representations.
ctfidf_model = ClassTfidfTransformer(
seed_words=["agent", "robot", "behavior", "policies", "environment"],
seed_multiplier=2
)
# We run the topic model with the seeded words
topic_model = BERTopic(
umap_model=umap_model,
min_topic_size=15,
ctfidf_model=ctfidf_model,
).fit(abstracts)
```
Then, when we run `topic_model.get_topic(0)`, we get the following output:
```python
[('policy', 0.023413102511982354),
('reinforcement', 0.021796126795834238),
('agent', 0.021131601305431902),
('policies', 0.01888385271486409),
('environment', 0.017819874593917057),
('learning', 0.015321710504308708),
('robot', 0.013881115279230468),
('control', 0.013297705894983875),
('the', 0.013247933839985382),
('to', 0.013058208312484141)]
```
As we can see, the output includes some of the seed words that we assigned. However, if a word is not found to be important in a topic then we can still multiply its importance but it will remain relatively low. This is a great feature as it allows you to improve their importance with less risk of making words important in topics that really should not be.
A benefit of this method is that this often influences all other representation methods, like KeyBERTInspired and OpenAI. The reason for this is that each representation model uses the words generated by the `ClassTfidfTransformer` as candidate words to be further optimized. In many cases, words like *"TNM"* might not end up in the candidate words. By increasing their importance, they are more likely to end up as candidate words in representation models.
Another benefit of using this method is that it artificially increases the interpretability of topics. Sure, some words might be more important than others but they might not mean something to a domain expert. For them, certain words, like *"TNM"*, are highly descriptive and that is something difficult to capture using any method (embedding model, large language model, etc.).
Moreover, these `seed_words` can be defined together with the domain expert as they can decide what type of words are generally important and might need a nudge from you, the algorithmic developer.
@@ -0,0 +1,88 @@
In BERTopic, you have several options to nudge the creation of topics toward certain pre-specified topics. Here, we will be looking at semi-supervised topic modeling with BERTopic.
Semi-supervised modeling allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have.
<br>
<div class="svg_image">
--8<-- "docs/getting_started/semisupervised/semisupervised.svg"
</div>
<br>
In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents
with HDBSCAN.
First, let us prepare the data needed for our topic model:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = data["data"]
categories = data["target"]
category_names = data["target_names"]
```
We are using the popular 20 Newsgroups dataset which contains roughly 18000 newsgroups posts that each is
assigned to one of 20 categories. Using this dataset we can try to extract its corresponding topic model whilst
taking its underlying categories into account. These categories are stored here in the variable `categories`.
Each document can be put into one of the following categories:
```python
>>> category_names
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
```
To perform this semi-supervised approach, we can take in some pre-defined topics and simply pass those to the `y` parameter when fitting BERTopic. These labels can be pre-defined topics or simply documents that you feel belong together regardless of their content. BERTopic will nudge the creation of topics toward these categories
using the pre-defined labels.
To perform supervised topic modeling, we simply use all categories:
```python
topic_model = BERTopic(verbose=True).fit(docs, y=categories)
```
The topic model will be much more attuned to the categories that were defined previously. However, this does not mean that only topics for these categories will be found. BERTopic is likely to find more specific topics in those you have already defined. This allows you to discover previously unknown topics!
## **Partial labels**
At times, you might only have labels for a subset of documents. Fortunately, we can still use those labels to at least nudge the documents for which those labels exist. The documents for which we do not have labels are assigned a -1. For this example, imagine we only have the labels of categories that are related to computers and we want to create a topic model using semi-supervised modeling:
```python
labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'comp.windows.x',]
indices = [category_names.index(label) for label in labels_to_add]
y = [label if label in indices else -1 for label in categories]
```
The `y` variable contains many -1 values since we do not know all the categories.
Next, we use those newly constructed labels to again train BERTopic in a semi-supervised manner:
```python
topic_model = BERTopic(verbose=True).fit(docs, y=y)
```
And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the pre-defined categories.
@@ -0,0 +1,21 @@
<svg width="534" height="135" viewBox="0 0 534 135" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="534" height="57" fill="white"/>
<rect x="0.5" y="14.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="10.9697">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="183" y="10.9697">UMAP</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="313" y="10.9697">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="468" y="10.9697">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="38.7637">Embeddings</tspan></text>
<rect x="142.5" y="14.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="156.094" y="33.7637">Dimensionality &#10;</tspan><tspan x="171.762" y="47.7637">reduction</tspan></text>
<rect x="162.5" y="104.5" width="62" height="30" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="176.336" y="123.764">Labels</tspan></text>
<path d="M126.707 33.7071C127.098 33.3166 127.098 32.6834 126.707 32.2929L120.343 25.9289C119.953 25.5384 119.319 25.5384 118.929 25.9289C118.538 26.3195 118.538 26.9526 118.929 27.3431L124.586 33L118.929 38.6569C118.538 39.0474 118.538 39.6805 118.929 40.0711C119.319 40.4616 119.953 40.4616 120.343 40.0711L126.707 33.7071ZM99 34H126V32H99V34Z" fill="black"/>
<rect x="295.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="317" y="38.7637">Clustering</tspan></text>
<path d="M285.707 33.7071C286.098 33.3166 286.098 32.6834 285.707 32.2929L279.343 25.9289C278.953 25.5384 278.319 25.5384 277.929 25.9289C277.538 26.3195 277.538 26.9526 277.929 27.3431L283.586 33L277.929 38.6569C277.538 39.0474 277.538 39.6805 277.929 40.0711C278.319 40.4616 278.953 40.4616 279.343 40.0711L285.707 33.7071ZM258 34H285V32H258V34Z" fill="black"/>
<rect x="442.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="472.404" y="30.7637">Topic &#10;</tspan><tspan x="450.215" y="44.7637">representation</tspan></text>
<path d="M426.707 33.7071C427.098 33.3166 427.098 32.6834 426.707 32.2929L420.343 25.9289C419.953 25.5384 419.319 25.5384 418.929 25.9289C418.538 26.3195 418.538 26.9526 418.929 27.3431L424.586 33L418.929 38.6569C418.538 39.0474 418.538 39.6805 418.929 40.0711C419.319 40.4616 419.953 40.4616 420.343 40.0711L426.707 33.7071ZM399 34H426V32H399V34Z" fill="black"/>
<path d="M194.707 66.2929C194.317 65.9024 193.683 65.9024 193.293 66.2929L186.929 72.6569C186.538 73.0474 186.538 73.6805 186.929 74.0711C187.319 74.4616 187.953 74.4616 188.343 74.0711L194 68.4142L199.657 74.0711C200.047 74.4616 200.681 74.4616 201.071 74.0711C201.462 73.6805 201.462 73.0474 201.071 72.6569L194.707 66.2929ZM195 94L195 67L193 67L193 94L195 94Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 3.6 KiB

@@ -0,0 +1,144 @@
Saving, loading, and sharing a BERTopic model can be done in several ways. It is generally advised to go with `.safetensors` as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as `.pickle` and pytorch `.bin` are also possible.
## **Saving**
There are three methods for saving BERTopic:
1. A light model with `.safetensors` and config files
2. A light model with pytorch `.bin` and config files
3. A full model with `.pickle`
!!! Tip "Tip"
It is advised to use methods 1 or 2 for saving as they generate very small models. Especially method 1 (`safetensors`)
allows for a relatively safe format compared to the other methods.
The methods are used as follows:
```python
topic_model = BERTopic().fit(my_docs)
# Method 1 - safetensors
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("path/to/my/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
# Method 2 - pytorch
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("path/to/my/model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)
# Method 3 - pickle
topic_model.save("my_model", serialization="pickle")
```
!!! Warning "Warning"
When saving the model, make sure to also keep track of the versions of dependencies and Python used.
Loading and saving the model should be done using the same dependencies and Python. Moreover, models
saved in one version of BERTopic are not guaranteed to load in other versions.
### **Pickle Drawbacks**
Saving the model with `pickle` allows for saving the entire topic model, including dimensionality reduction and clustering algorithms, but has several drawbacks:
* Arbitrary code can be run from `.pickle` files
* The resulting model is rather large (often > 500MB) since all sub-models need to be saved
* Explicit and specific version control is needed as they typically only run if the environment is exactly the same
### **Safetensors and Pytorch Advantages**
Saving the topic modeling with `.safetensors` or `pytorch` has a number of advantages:
* `.safetensors` is a relatively **safe format**
* The resulting model can be **very small** (often < 20MB) since no sub-models need to be saved
* Although version control is important, there is a bit more **flexibility** with respect to specific versions of packages
* More easily used in **production**
* **Share** models with the HuggingFace Hub
<br><br>
<img src="serialization.png">
<br><br>
The above image, a model trained on 100,000 documents, demonstrates the differences in sizes comparing `safetensors`, `pytorch`, and `pickle`. The difference in sizes can mostly be explained due to the efficient saving procedure and that the clustering and dimensionality reductions are not saved in safetensors/pytorch since inference can be done based on the topic embeddings.
## **HuggingFace Hub**
When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account which you can do in a number of ways:
* Log in to your Hugging Face account with the command below
```bash
huggingface-cli login
# or using an environment variable
huggingface-cli login --token $HUGGINGFACE_TOKEN
```
* Alternatively, you can programmatically login using login() in a notebook or a script
```python
from huggingface_hub import login
login()
```
* Or you can give a token with the `token` variable
When you have logged in to your HuggingFace account, you can save and upload the model as follows:
```python
from bertopic import BERTopic
# Train model
topic_model = BERTopic().fit(my_docs)
# Push to HuggingFace Hub
topic_model.push_to_hf_hub(
repo_id="MaartenGr/BERTopic_ArXiv",
save_ctfidf=True
)
# Load from HuggingFace
loaded_model = BERTopic.load("MaartenGr/BERTopic_ArXiv")
```
### **Parameters**
There are a number of parameters that may be worthwhile to know:
* `private`
* Whether to create a private repository
* `serialization`
* The type of serialization. Either `safetensors` or `pytorch`. Make sure to run `pip install safetensors` for safetensors.
* `save_embedding_model`
* A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., `sentence-transformers/all-MiniLM-L6-v2`
* `save_ctfidf`
* Whether to save c-TF-IDF information
## **Loading**
To load a model:
```python
# Load from directory
loaded_model = BERTopic.load("path/to/my/model_dir")
# Load from file
loaded_model = BERTopic.load("my_model")
# Load from HuggingFace
loaded_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")
```
The embedding model cannot always be saved using a non-pickle method if, for example, you are using OpenAI embeddings. Instead, you can load them in as follows:
```python
# Define embedding model
import openai
from bertopic.backend import OpenAIBackend
client = openai.OpenAI(api_key="sk-...")
embedding_model = OpenAIBackend(client, "text-embedding-ada-002")
# Load model and add embedding model
loaded_model = BERTopic.load("path/to/my/model_dir", embedding_model=embedding_model)
```
Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

@@ -0,0 +1,14 @@
<svg width="387" height="56" viewBox="0 0 387 56" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="387" height="56" fill="white"/>
<rect x="0.5" y="13.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="9.96973">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="150" y="9.96973">Logistic Regression</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="320" y="9.96973">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="37.7637">Embeddings</tspan></text>
<rect x="142.5" y="13.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="173.045" y="37.7637">Classifier</tspan></text>
<path d="M126.707 32.7071C127.098 32.3166 127.098 31.6834 126.707 31.2929L120.343 24.9289C119.953 24.5384 119.319 24.5384 118.929 24.9289C118.538 25.3195 118.538 25.9526 118.929 26.3431L124.586 32L118.929 37.6569C118.538 38.0474 118.538 38.6805 118.929 39.0711C119.319 39.4616 119.953 39.4616 120.343 39.0711L126.707 32.7071ZM99 33H126V31H99V33Z" fill="black"/>
<rect x="295.5" y="13.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="327.279" y="31.7637">Topic&#10;</tspan><tspan x="303.215" y="45.7637">representation</tspan></text>
<path d="M285.707 32.7071C286.098 32.3166 286.098 31.6834 285.707 31.2929L279.343 24.9289C278.953 24.5384 278.319 24.5384 277.929 24.9289C277.538 25.3195 277.538 25.9526 277.929 26.3431L283.586 32L277.929 37.6569C277.538 38.0474 277.538 38.6805 277.929 39.0711C278.319 39.4616 278.953 39.4616 279.343 39.0711L285.707 32.7071ZM258 33H285V31H258V33Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 2.2 KiB

@@ -0,0 +1,18 @@
<svg width="534" height="57" viewBox="0 0 534 57" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="534" height="57" fill="white"/>
<rect x="0.5" y="14.5" width="88" height="42" fill="white" stroke="black"/>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="30" y="10.9697">SBERT</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="183" y="10.9697">UMAP</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="313" y="10.9697">HDBSCAN</tspan></text>
<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="468" y="10.9697">c-TF-IDF</tspan></text>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="9" y="38.7637">Embeddings</tspan></text>
<rect x="142.5" y="14.5" width="105" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="156.094" y="33.7637">Dimensionality &#10;</tspan><tspan x="171.762" y="47.7637">reduction</tspan></text>
<path d="M126.707 33.7071C127.098 33.3166 127.098 32.6834 126.707 32.2929L120.343 25.9289C119.953 25.5384 119.319 25.5384 118.929 25.9289C118.538 26.3195 118.538 26.9526 118.929 27.3431L124.586 33L118.929 38.6569C118.538 39.0474 118.538 39.6805 118.929 40.0711C119.319 40.4616 119.953 40.4616 120.343 40.0711L126.707 33.7071ZM99 34H126V32H99V34Z" fill="black"/>
<rect x="295.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="317" y="38.7637">Clustering</tspan></text>
<path d="M285.707 33.7071C286.098 33.3166 286.098 32.6834 285.707 32.2929L279.343 25.9289C278.953 25.5384 278.319 25.5384 277.929 25.9289C277.538 26.3195 277.538 26.9526 277.929 27.3431L283.586 33L277.929 38.6569C277.538 39.0474 277.538 39.6805 277.929 40.0711C278.319 40.4616 278.953 40.4616 279.343 40.0711L285.707 33.7071ZM258 34H285V32H258V34Z" fill="black"/>
<rect x="442.5" y="14.5" width="91" height="42" fill="white" stroke="black"/>
<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="472.404" y="30.7637">Topic &#10;</tspan><tspan x="450.215" y="44.7637">representation</tspan></text>
<path d="M426.707 33.7071C427.098 33.3166 427.098 32.6834 426.707 32.2929L420.343 25.9289C419.953 25.5384 419.319 25.5384 418.929 25.9289C418.538 26.3195 418.538 26.9526 418.929 27.3431L424.586 33L418.929 38.6569C418.538 39.0474 418.538 39.6805 418.929 40.0711C419.319 40.4616 419.953 40.4616 420.343 40.0711L426.707 33.7071ZM399 34H426V32H399V34Z" fill="black"/>
</svg>

After

Width:  |  Height:  |  Size: 3.0 KiB

@@ -0,0 +1,16 @@
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 141.29942040537162 108.05440778228649" width="423.89826121611486" height="324.16322334685947">
<!-- svg-source:excalidraw -->
<defs>
<style class="style-fonts">
@font-face {
font-family: "Virgil";
src: url("https://excalidraw.com/Virgil.woff2");
}
@font-face {
font-family: "Cascadia";
src: url("https://excalidraw.com/Cascadia.woff2");
}
</style>
</defs>
<g stroke-linecap="round" transform="translate(10 10) rotate(0 60.64971020268581 44.027203891143245)"><path d="M-2.73 1.92 C24.06 1.09, 56.8 -3.83, 119.99 -2.47 M1.8 1.6 C39.19 0.7, 76.15 0.87, 122.58 -0.36 M123.19 3.98 C118.93 26.12, 126.15 49.95, 118.13 84.42 M120.17 0 C120.71 33.24, 123.86 64.82, 123.23 86.14 M123.15 86.29 C93.9 84.44, 73.27 85.17, -2.56 87.54 M119.47 89.23 C82.71 86.5, 39.34 85.91, 1.55 89.48 M-1.53 87.36 C-2.82 58.23, -1.26 26.16, 1.97 0.48 M-0.96 87.6 C3.28 54.37, 0.14 24.21, 1.59 -0.59" stroke="#000000" stroke-width="1" fill="none"></path></g></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

@@ -0,0 +1,120 @@
Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Similarly, you might already have created some labels yourself through packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat) or something entirely different.
Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic and try to learn the relationship between those topics and the input documents.
> In other words, we are going to be performing classification instead!
We can view this as a supervised topic modeling approach. Instead of using a clustering algorithm, we are going to be using a classification algorithm.
Generally, we have the following pipeline:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/supervised/default_pipeline.svg"
</div>
<br>
Instead, we are now going to skip over the dimensionality reduction step and replace the clustering step with a classification model:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/supervised/classification_pipeline.svg"
</div>
<br>
In other words, we can pass our labels to BERTopic and it will not only learn how to predict labels for new instances, but it also transforms those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.
To do so, we need to skip over the dimensionality reduction step and replace the clustering step with a classification algorithm. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:
```python
from sklearn.datasets import fetch_20newsgroups
# Get labeled data
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']
```
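Then, we make sure to create an empty instance of the dimensionality reduction step and a classifier that takes the place of the clustering step. We pass those to BERTopic to simply skip over dimensionality reduction, classify the documents instead of clustering them, and go to the topic representation process: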
```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
# Get labeled data
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']
# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Create a fully supervised BERTopic instance
topic_model= BERTopic(
umap_model=empty_dimensionality_model,
hdbscan_model=clf,
ctfidf_model=ctfidf_model
)
topics, probs = topic_model.fit_transform(docs, y=y)
```
Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`:
<br>
<div class="svg_image">
--8<-- "docs/getting_started/supervised/table.svg"
</div>
<br>
We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:
```python
# Map input `y` to topics
mappings = topic_model.topic_mapper_.get_mappings()
mappings = {value: data["target_names"][key] for key, value in mappings.items()}
# Assign original classes to our topics
df = topic_model.get_topic_info()
df["Class"] = df.Topic.map(mappings)
df
```
<div class="svg_image">
--8<-- "docs/getting_started/supervised/table_classes.svg"
</div>
<br>
We can see that the c-TF-IDF representations extract the words that give a good representation of our input classes. This is all done directly from the labeling. A welcome side-effect is that we now have a classification algorithm that allows us to predict the topics of unseen data:
```python
>>> topic, _ = topic_model.transform("this is a document about cars")
>>> topic_model.get_topic(topic)
[('car', 0.4407600315538472),
('cars', 0.32348015696446325),
('engine', 0.28032518444946686),
('ford', 0.2500224508115155),
('oil', 0.2325984913598611),
('dealer', 0.2310723968585826),
('my', 0.22045777551991935),
('it', 0.21327993649430219),
('tires', 0.20420842634292657),
('brake', 0.20246902481367085)]
```
Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.
!!! note
The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following:
```python
mappings = topic_model.topic_mapper_.get_mappings()
y_mapped = [mappings[val] for val in y]
```

Some files were not shown because too many files have changed in this diff Show More