# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
#   - 'dev': development instance
#   - 'staging': staging instance
#   - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
# NOTE(review): with every key below commented out, `crawl:` parses as
# null; confirm that the loader accepts a null section here.
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process.
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the Elasticsearch service
# No default values; must be set.
elasticsearch:
  # Host on which Elasticsearch is running
  host: localhost
  # API key for accessing Elasticsearch
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# TensorFlow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict